diff --git a/pyprophet/_config.py b/pyprophet/_config.py index 41ec4b1e..bec1da7f 100644 --- a/pyprophet/_config.py +++ b/pyprophet/_config.py @@ -635,6 +635,7 @@ class ExportIOConfig(BaseIOConfig): - "legacy_split": Split TSV files for each run. - "parquet": Single Parquet file with merged results. - "parquet_split": Split Parquet files for each run. + - "library" : .tsv library file out_type (Literal["tsv", "csv"]): Output file type for exported results. transition_quantification (bool): Report aggregated transition-level quantification. max_transition_pep (float): Maximum PEP to retain scored transitions for quantification (requires transition-level scoring). @@ -653,6 +654,7 @@ class ExportIOConfig(BaseIOConfig): top_n (int): Number of top intense features to use for summarization consistent_top (bool): Whether to use same top features across all runs normalization (Literal["none", "median", "medianmedian", "quantile"]): Normalization method + test: bool = False: Whether to enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge # OSW: Export to parquet compression_method (Literal["none", "snappy", "gzip", "brotli", "zstd"]): Compression method for parquet files. @@ -662,10 +664,18 @@ class ExportIOConfig(BaseIOConfig): # SqMass: Export to parquet pqp_file (Optional[str]): Path to PQP file for precursor/transition mapping. + + # Export to library + rt_calibration (bool): If True, will use emperical RT values as oppose to the original library RT values + im_calibration (bool): If True, will use emperical IM values as oppose to the original library IM values + intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values + min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True + keep_decoys (bool): Whether to keep decoy entries in the library, will only keep decoys that pass the thresholds specified + rt_unit (Literal["iRT", "RT"], default = 'iRT') = "iRT": Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library """ export_format: Literal[ - "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split" + "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split", "library" ] = "legacy_merged" out_type: Literal["tsv", "csv"] = "tsv" transition_quantification: bool = False @@ -677,6 +687,7 @@ class ExportIOConfig(BaseIOConfig): max_global_peptide_qvalue: float = 0.01 protein: bool = True max_global_protein_qvalue: float = 0.01 + test: bool = False # Quantification matrix options top_n: int = 3 @@ -691,3 +702,11 @@ class ExportIOConfig(BaseIOConfig): # SqMass: Export to parquet pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping + + # Export to library options + rt_calibration: bool = True + im_calibration: bool = True + intensity_calibration: bool = True + min_fragments: int = 4 + keep_decoys: bool = False # Whether to keep decoy entries in the library + rt_unit: Literal["iRT", "RT"] = "iRT" \ No newline at end of file diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py index 9ca6a3e1..2162093c 100644 --- a/pyprophet/cli/export.py +++ b/pyprophet/cli/export.py @@ -37,6 +37,7 @@ def export(): pass export.add_command(export_tsv, name="tsv") + export.add_command(export_library, name='library') export.add_command(export_matrix, name="matrix") export.add_command(export_parquet, name="parquet") export.add_command(export_compound, name="compound") @@ -347,6 +348,131 @@ def export_matrix( df = reader.read() writer.export_quant_matrix(df) +# Export to Library to be used in OpenSWATH +@click.command(name="library", cls=AdvancedHelpCommand) +@click.option( + "--in", + "infile", + required=True, + type=click.Path(exists=True), + help="PyProphet OSW input file.", +) +@click.option( + "--out", + "outfile", + required=True, # need to name the library or else get error in os.path.splittext line 75, in __post_init__in _base. + type=click.Path(exists=False), + help="Output tsv library.", +) +@click.option( + "--max_peakgroup_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum run-specific peak group-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates. If there are multiple runs with the same precursors, the run with the lowest q value is used", +) +@click.option( + "--max_global_peptide_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum global peptide-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates." +) +@click.option( + "--max_global_protein_qvalue", + default=0.01, + show_default=True, + type=float, + help="Filter results to maximum global protein-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates." +) +@click.option( + "--rt_calibration/--no-rt_calibration", + default=True, + show_default=True, + help="Use empirical RT values as oppose to the original library RT values." +) +@click.option( + "--im_calibration/--no-im_calibration", + default=True, + show_default=True, + help="Use empirical IM values as oppose to the original library IM values." +) +@click.option( + "--intensity_calibration/--no-intensity_calibration", + default=True, + show_default=True, + help="Use empirical intensity values as oppose to the original library intensity values." +) +@click.option( + "--min_fragments", + default=4, + show_default=True, + type=int, + help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True." +) +@click.option( + "--keep_decoys/--no-keep_decoys", + default=False, + show_default=True, + type=bool, + help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above" +) +@click.option( + "--rt_unit", + default="iRT", + show_default=True, + type=click.Choice(["iRT", "RT"]), + help='Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library.', + hidden=True +) +@click.option( + "--test/--no-test", + default=False, + show_default=True, + help="Enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge") +@measure_memory_usage_and_time +def export_library( + infile, + outfile, + max_peakgroup_qvalue, + max_global_peptide_qvalue, + max_global_protein_qvalue, + rt_calibration, + im_calibration, + intensity_calibration, + min_fragments, + keep_decoys, + rt_unit, + test +): + """ + Export OSW to tsv library format + """ + config = ExportIOConfig( + infile=infile, + outfile=outfile, + subsample_ratio=1.0, # Not used in export + level="export", + context="export", + export_format="library", + out_type="tsv", + max_rs_peakgroup_qvalue=max_peakgroup_qvalue, + max_global_peptide_qvalue=max_global_peptide_qvalue, + max_global_protein_qvalue=max_global_protein_qvalue, + rt_calibration=rt_calibration, + im_calibration=im_calibration, + intensity_calibration=intensity_calibration, + min_fragments=min_fragments, + keep_decoys=keep_decoys, + rt_unit=rt_unit, + test=test + ) + + reader = ReaderDispatcher.get_reader(config) + writer = WriterDispatcher.get_writer(config) + + df = reader.read() + writer.clean_and_export_library(df) # Export to Parquet @click.command(name="parquet", cls=AdvancedHelpCommand) diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py index 89e284d9..23aa3680 100644 --- a/pyprophet/io/_base.py +++ b/pyprophet/io/_base.py @@ -48,6 +48,7 @@ import duckdb import pandas as pd import polars as pl +import sklearn.preprocessing as preprocessing # For MinMaxScaler from loguru import logger from .._base import BaseIOConfig @@ -619,6 +620,62 @@ def export_results(self, data: pd.DataFrame): else: raise ValueError(f"Unsupported export format: {cfg.export_format}") + def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame: + """ + This function cleans the original dataframe and exports the library + + Args: + data: Input DataFrame with library data + + """ + cfg = self.config + + # For precursors found in more than one run, select the run with the smallest q value + # If q values are the same, select the first run + data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1) + assert len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value Intensity and RunId, duplicate transition IDs found." + + # Remove Annotation Column if all NAN + if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all(): + logger.debug("Annotation column is empty, so computing it manually.") + data.drop(columns=['Annotation'], inplace=True) + data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str) + + if cfg.rt_calibration and cfg.rt_unit == "iRT": + data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100 + if cfg.intensity_calibration: + data['LibraryIntensity'] = ( + data['LibraryIntensity'] / + data.groupby('Precursor')['LibraryIntensity'].transform('max') * + 10000) + logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0]))) + # Remove rows with zero intensity + data = data[data['LibraryIntensity'] > 0] + + ## Print Library statistics + logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors") + + logger.info(f"Precursor Fragment Distribution (Before Filtering)") + num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count() + for frag, count in num_frags_per_prec.iterrows(): + logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)") + + logger.info(f"Filter library to precursors containing {cfg.min_fragments} or more fragments") + ids_to_keep = data[['Precursor', 'Annotation']].groupby('Precursor').count() + ids_to_keep = ids_to_keep[ ids_to_keep['Annotation'] >= cfg.min_fragments ].index + data = data[ data['Precursor'].isin(ids_to_keep) ] + + logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors") + if cfg.keep_decoys: + logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates()))) + + data.drop(columns=['TransitionId', 'Q_Value', 'RunId', 'Intensity'], inplace=True) + if cfg.test: + data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz']) + + logger.info("Exporting library to file.") + data.to_csv(cfg.outfile, sep='\t', index=False) + def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame: """ Export quantification matrix at specified level with optional normalization. diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py index 02d1669d..d4520930 100644 --- a/pyprophet/io/export/osw.py +++ b/pyprophet/io/export/osw.py @@ -123,6 +123,9 @@ def _read_sqlite(self, con): """Main entry point for reading SQLite data, delegates to specific methods.""" cfg = self.config + if self.config.export_format == "library": + raise NotImplementedError("Library export from sqlite OSW files is not supported") + if self._is_unscored_file(con): logger.info("Reading unscored data from Parquet file.") return self._read_unscored_data(con) diff --git a/pyprophet/io/export/parquet.py b/pyprophet/io/export/parquet.py index 16026cdd..556abdd4 100644 --- a/pyprophet/io/export/parquet.py +++ b/pyprophet/io/export/parquet.py @@ -36,6 +36,9 @@ def read(self) -> pd.DataFrame: try: self._init_duckdb_views(con) + if self.config.export_format == "library": + raise NotImplementedError("Library export from non-split .parquet files is not supported") + if self._is_unscored_file(): logger.info("Reading unscored data from Parquet file.") return self._read_unscored_data(con) diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py index 6137d533..265130a8 100644 --- a/pyprophet/io/export/split_parquet.py +++ b/pyprophet/io/export/split_parquet.py @@ -68,6 +68,18 @@ def read(self) -> pd.DataFrame: try: self._init_duckdb_views(con) + if self.config.export_format == "library": + if self._is_unscored_file(): + descr= "Files must be scored for library generation." + logger.exception(descr) + raise ValueError(descr) + if not self._has_peptide_protein_global_scores(): + descr= "Files must have peptide and protein level global scores for library generation." + logger.exception(descr) + raise ValueError(descr) + logger.info("Reading standard OpenSWATH data for library from split Parquet files.") + return self._read_library_data(con) + if self._is_unscored_file(): logger.info("Reading unscored data from split Parquet files.") return self._read_unscored_data(con) @@ -82,9 +94,17 @@ def read(self) -> pd.DataFrame: logger.info("Reading standard OpenSWATH data from split Parquet files.") data = self._read_standard_data(con) - return self._augment_data(data, con) + return self._augment_data(data, con) finally: con.close() + + def _has_peptide_protein_global_scores(self) -> bool: + """ + Check if files contain peptide and protein global scores + """ + has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns) + has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns) + return has_peptide and has_protein def _is_unscored_file(self) -> bool: """ @@ -257,6 +277,66 @@ def _read_augmented_data(self, con) -> pd.DataFrame: return pd.merge(data, ipf_data, on="id", how="left") + def _read_library_data(self, con) -> pd.DataFrame: + """ + Read data specifically for precursors for library generation. This does not include all output in standard output + """ + if self.config.rt_calibration: + rt_col = "p.EXP_RT" + else: + rt_col = "p.PRECURSOR_LIBRARY_RT" + + if self.config.im_calibration: + im_col = "p.EXP_IM" + else: + im_col = "p.PRECURSOR_LIBRARY_DRIFT_TIME" + + if self.config.intensity_calibration: + intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY' + else: + intensity_col = 't.TRANSITION_LIBRARY_INTENSITY' + + if self.config.keep_decoys: + decoy_query = "" + else: + decoy_query ="p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and" + + query = f""" + SELECT + {rt_col} as NormalizedRetentionTime, + {im_col} as PrecursorIonMobility, + {intensity_col} as LibraryIntensity, + p.SCORE_MS2_Q_VALUE as Q_Value, + p.UNMODIFIED_SEQUENCE AS PeptideSequence, + p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence, + p.PRECURSOR_CHARGE AS PrecursorCharge, + p.FEATURE_MS2_AREA_INTENSITY AS Intensity, + p.RUN_ID AS RunId, + (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor, + p.PRECURSOR_MZ AS PrecursorMz, + STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName, + t.ANNOTATION as Annotation, + t.PRODUCT_MZ as ProductMz, + t.TRANSITION_CHARGE as FragmentCharge, + t.TRANSITION_TYPE as FragmentType, + t.TRANSITION_ORDINAL as FragmentSeriesNumber, + t.TRANSITION_ID as TransitionId, + p.PRECURSOR_DECOY as Decoy + FROM precursors p + INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID + WHERE {decoy_query} + p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and + p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and + p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and + p.SCORE_MS2_PEAK_GROUP_RANK = 1 + + GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE, + p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE, + p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ, + t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID, p.FEATURE_MS2_AREA_INTENSITY + """ + return con.execute(query).fetchdf() + def _read_standard_data(self, con) -> pd.DataFrame: """ Read standard OpenSWATH data without IPF from split files. diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out new file mode 100644 index 00000000..e69de29b diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out new file mode 100644 index 00000000..37c4cd9b --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out new file mode 100644 index 00000000..37c4cd9b --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b -1.0 AAEDFTLLVK(UniMod:259) 58.9 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b -1.0 AASEIATAELAPTHPIR(UniMod:267) 31.5 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out new file mode 100644 index 00000000..5d75e9c9 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b 763.6335 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b 619.6819 AAEDFTLLVK(UniMod:259) 3665.82 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b 4293.9906 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b 2245.5035 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b 1169.3817 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b 796.7460 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b 616.6858 AASEIATAELAPTHPIR(UniMod:267) 2754.99 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out new file mode 100644 index 00000000..8f3d8b08 --- /dev/null +++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out @@ -0,0 +1,14 @@ + Annotation Decoy FragmentCharge FragmentSeriesNumber FragmentType LibraryIntensity ModifiedPeptideSequence NormalizedRetentionTime PeptideSequence Precursor PrecursorCharge PrecursorIonMobility PrecursorMz ProductMz ProteinName +0 -1^1 0 1 -1 NaN 10000.0000 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 557.8153 AQUA4SWATH_HMLangeE +1 b4^1 0 1 4 b 1912.5839 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 387.1510 AQUA4SWATH_HMLangeE +2 b5^1 0 1 5 b 910.8388 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 534.2195 AQUA4SWATH_HMLangeE +3 b6^1 0 1 6 b 763.6335 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 635.2671 AQUA4SWATH_HMLangeE +4 b7^1 0 1 7 b 619.6819 AAEDFTLLVK(UniMod:259) 62.4638 AAEDFTLLVK AAEDFTLLVK(UniMod:259)_2 2 NaN 557.8153 748.3512 AQUA4SWATH_HMLangeE +.. ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... +95 b4^1 0 1 4 b 4293.9906 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 359.1561 AQUA4SWATH_PombeSchmidt +96 b5^1 0 1 5 b 2245.5035 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 472.2402 AQUA4SWATH_PombeSchmidt +97 b6^1 0 1 6 b 1169.3817 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 543.2773 AQUA4SWATH_PombeSchmidt +98 b7^1 0 1 7 b 796.7460 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 644.3250 AQUA4SWATH_PombeSchmidt +99 b8^1 0 1 8 b 616.6858 AASEIATAELAPTHPIR(UniMod:267) 45.3495 AASEIATAELAPTHPIR AASEIATAELAPTHPIR(UniMod:267)_2 2 NaN 879.4746 715.3621 AQUA4SWATH_PombeSchmidt + +[100 rows x 15 columns] diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py index 19e0bece..6e05f8c3 100644 --- a/tests/test_pyprophet_export.py +++ b/tests/test_pyprophet_export.py @@ -91,7 +91,10 @@ def run_pyprophet_command(cmd, temp_folder): ).decode() except subprocess.CalledProcessError as error: print(f"Command failed: {cmd}\n{error.output.decode()}", file=sys.stderr) - raise + if "NotImplementedError" in error.output.decode(): # attempt to catch the specific error rather than the CalledProcessError + raise NotImplementedError + else: + raise def validate_export_results( @@ -147,6 +150,38 @@ def test_osw_analysis( f"{temp_folder}/test_data.tsv", ) +@pytest.mark.parametrize( + "calib, rt_unit", + [ (True, 'iRT'), (False, 'iRT'), (True, 'RT'), (False, 'RT')] +) +def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib, rt_unit +): + cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && " + + # peptide-level + cmd += f"pyprophet infer peptide --pi0_lambda=0.001 0 0 {input_strategy['cmd_prefix']} --context=global && " + + # protein-level + cmd += f"pyprophet infer protein --pi0_lambda=0 0 0 {input_strategy['cmd_prefix']} --context=global && " + + + # export + if calib: + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --rt_unit={rt_unit}" + else: + cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration --rt_unit={rt_unit}" + + if not input_strategy["reader"] == "parquet_split": + with pytest.raises(NotImplementedError): + run_pyprophet_command(cmd, temp_folder) + else: + run_pyprophet_command(cmd, temp_folder) + validate_export_results( + regtest, + input_strategy["path"], + input_strategy["reader"], + f"{temp_folder}/test_lib.tsv", + ) def test_osw_unscored(input_strategy, temp_folder, regtest): """Test export of unscored OSW data"""