Merged

Changes from all commits (27 commits)
fbbffd3
feature: start implementation of lib export with pyprophet
jcharkow Jun 17, 2025
02d5d65
more functionality to lib export
jcharkow Jun 18, 2025
1ef91f9
change default min frags to 4
jcharkow Jul 9, 2025
0c999a6
filter fragments with 0 library intensity
jcharkow Jul 9, 2025
0a6e7db
require a --out parameter
jcharkow Jul 9, 2025
5f41220
change config from 6 to 4
jcharkow Jul 9, 2025
d725eb8
fix bugs, update docs
jcharkow Jul 10, 2025
9b108aa
fix: export protein info in lib
jcharkow Jul 10, 2025
e36e23f
fix: lib export compute annotation col if empty
jcharkow Jul 11, 2025
d4c48a5
Merge branch 'feature/polars_explode' into feature/lib_export
jcharkow Jul 29, 2025
76bbff7
feature: option to keep significant decoys in lib refinement
jcharkow Jul 30, 2025
f8b3753
verbose: note that keep_decoys in lib gen is experimental feature
jcharkow Aug 7, 2025
4b9076b
test: add test for lib generation
jcharkow Aug 8, 2025
f455b10
minor refactor for better support across different i/o
jcharkow Aug 8, 2025
1667c25
add not implemented error for osw/parquet output
jcharkow Aug 8, 2025
609e84f
feature: add option to export rt unit in non iRT
jcharkow Aug 11, 2025
5c6bda2
remove debug line
jcharkow Aug 11, 2025
bb5607c
switch keep_decoys default to no-keep_decoys
jcharkow Aug 11, 2025
d959d70
bug fix: transitions with diff RT
jcharkow Aug 11, 2025
989ade1
update parameter descriptions
jcharkow Aug 12, 2025
0aae3a8
fix: error description
jcharkow Aug 12, 2025
ee3bb62
Merge branch 'feature/lib_export' of github.com:Roestlab/pyprophet in…
jcharkow Aug 12, 2025
c3daea1
apply suggestions from PR review
jcharkow Aug 12, 2025
3a8d203
test: update tests with new snapshots
jcharkow Aug 12, 2025
76ac52c
feature: sort by intensity if q value tie
jcharkow Aug 19, 2025
1c32cb6
apply copilot suggestions
jcharkow Aug 19, 2025
0c0d99d
fix: bug in sql query
jcharkow Aug 28, 2025
21 changes: 20 additions & 1 deletion pyprophet/_config.py
@@ -635,6 +635,7 @@ class ExportIOConfig(BaseIOConfig):
- "legacy_split": Split TSV files for each run.
- "parquet": Single Parquet file with merged results.
- "parquet_split": Split Parquet files for each run.
- "library" : .tsv library file
out_type (Literal["tsv", "csv"]): Output file type for exported results.
transition_quantification (bool): Report aggregated transition-level quantification.
max_transition_pep (float): Maximum PEP to retain scored transitions for quantification (requires transition-level scoring).
@@ -653,6 +654,7 @@ class ExportIOConfig(BaseIOConfig):
top_n (int): Number of top intense features to use for summarization
consistent_top (bool): Whether to use same top features across all runs
normalization (Literal["none", "median", "medianmedian", "quantile"]): Normalization method
test (bool): Whether to enable test mode with deterministic behavior; test mode sorts libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge

# OSW: Export to parquet
compression_method (Literal["none", "snappy", "gzip", "brotli", "zstd"]): Compression method for parquet files.
@@ -662,10 +664,18 @@ class ExportIOConfig(BaseIOConfig):

# SqMass: Export to parquet
pqp_file (Optional[str]): Path to PQP file for precursor/transition mapping.

# Export to library
rt_calibration (bool): If True, use empirical RT values instead of the original library RT values
im_calibration (bool): If True, use empirical IM values instead of the original library IM values
intensity_calibration (bool): If True, use empirical intensity values instead of the original library intensity values
min_fragments (int): Minimum number of fragments required to include a peak group in the library; only relevant if intensity_calibration is True
keep_decoys (bool): Whether to keep decoy entries in the library; only decoys that pass the specified thresholds are kept
rt_unit (Literal["iRT", "RT"]): Unit of retention time in the library; only relevant if rt_calibration is True. If "iRT" is selected, the retention times are scaled to the iRT scale (0-100) in the library
"""

export_format: Literal[
"matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split"
"matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split", "library"
] = "legacy_merged"
out_type: Literal["tsv", "csv"] = "tsv"
transition_quantification: bool = False
@@ -677,6 +687,7 @@ class ExportIOConfig(BaseIOConfig):
max_global_peptide_qvalue: float = 0.01
protein: bool = True
max_global_protein_qvalue: float = 0.01
test: bool = False

# Quantification matrix options
top_n: int = 3
@@ -691,3 +702,11 @@ class ExportIOConfig(BaseIOConfig):

# SqMass: Export to parquet
pqp_file: Optional[str] = None # Path to PQP file for precursor/transition mapping

# Export to library options
rt_calibration: bool = True
im_calibration: bool = True
intensity_calibration: bool = True
min_fragments: int = 4
keep_decoys: bool = False # Whether to keep decoy entries in the library
rt_unit: Literal["iRT", "RT"] = "iRT"
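
For orientation, a minimal sketch of how the new library options compose into a config object; it mirrors the construction used by the export library CLI command further down and assumes ExportIOConfig is importable from pyprophet._config (the input path is hypothetical):

from pyprophet._config import ExportIOConfig

config = ExportIOConfig(
    infile="results.oswpq",          # hypothetical scored split-parquet input
    outfile="library.tsv",
    subsample_ratio=1.0,             # not used in export
    level="export",
    context="export",
    export_format="library",         # the new format added by this PR
    out_type="tsv",
    rt_calibration=True,             # empirical RT instead of library RT
    im_calibration=True,             # empirical IM instead of library IM
    intensity_calibration=True,      # empirical intensities, rescaled per precursor
    min_fragments=4,                 # drop precursors with fewer fragments
    keep_decoys=False,               # experimental: keep decoys passing thresholds
    rt_unit="iRT",                   # scale RTs to the 0-100 iRT range
)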
126 changes: 126 additions & 0 deletions pyprophet/cli/export.py
@@ -37,6 +37,7 @@ def export():
pass

export.add_command(export_tsv, name="tsv")
export.add_command(export_library, name="library")
export.add_command(export_matrix, name="matrix")
export.add_command(export_parquet, name="parquet")
export.add_command(export_compound, name="compound")
@@ -347,6 +348,131 @@ def export_matrix(
df = reader.read()
writer.export_quant_matrix(df)

# Export to a spectral library for use in OpenSWATH
@click.command(name="library", cls=AdvancedHelpCommand)
@click.option(
"--in",
"infile",
required=True,
type=click.Path(exists=True),
help="PyProphet OSW input file.",
)
@click.option(
"--out",
"outfile",
required=True,  # the library must be named, or else os.path.splitext (line 75, in __post_init__ in _base) raises an error
type=click.Path(exists=False),
help="Output tsv library.",
)
@click.option(
"--max_peakgroup_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum run-specific peak group-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates. If there are multiple runs with the same precursors, the run with the lowest q value is used",
)
@click.option(
"--max_global_peptide_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum global peptide-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
)
@click.option(
"--max_global_protein_qvalue",
default=0.01,
show_default=True,
type=float,
help="Filter results to maximum global protein-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
)
@click.option(
"--rt_calibration/--no-rt_calibration",
default=True,
show_default=True,
help="Use empirical RT values as oppose to the original library RT values."
)
@click.option(
"--im_calibration/--no-im_calibration",
default=True,
show_default=True,
help="Use empirical IM values as oppose to the original library IM values."
)
@click.option(
"--intensity_calibration/--no-intensity_calibration",
default=True,
show_default=True,
help="Use empirical intensity values as oppose to the original library intensity values."
)
@click.option(
"--min_fragments",
default=4,
show_default=True,
type=int,
help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True."
)
@click.option(
"--keep_decoys/--no-keep_decoys",
default=False,
show_default=True,
type=bool,
help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
)
@click.option(
"--rt_unit",
default="iRT",
show_default=True,
type=click.Choice(["iRT", "RT"]),
help='Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library.',
hidden=True
)
@click.option(
"--test/--no-test",
default=False,
show_default=True,
help="Enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge")
@measure_memory_usage_and_time
def export_library(
infile,
outfile,
max_peakgroup_qvalue,
max_global_peptide_qvalue,
max_global_protein_qvalue,
rt_calibration,
im_calibration,
intensity_calibration,
min_fragments,
keep_decoys,
rt_unit,
test
):
"""
Export OSW results to TSV library format.
"""
config = ExportIOConfig(
infile=infile,
outfile=outfile,
subsample_ratio=1.0, # Not used in export
level="export",
context="export",
export_format="library",
out_type="tsv",
max_rs_peakgroup_qvalue=max_peakgroup_qvalue,
max_global_peptide_qvalue=max_global_peptide_qvalue,
max_global_protein_qvalue=max_global_protein_qvalue,
rt_calibration=rt_calibration,
im_calibration=im_calibration,
intensity_calibration=intensity_calibration,
min_fragments=min_fragments,
keep_decoys=keep_decoys,
rt_unit=rt_unit,
test=test
)

reader = ReaderDispatcher.get_reader(config)
writer = WriterDispatcher.get_writer(config)

df = reader.read()
writer.clean_and_export_library(df)
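
One way to exercise the new subcommand end to end is click's test runner. A hedged sketch, assuming a scored split-parquet input (the paths are hypothetical; per the reader checks in split_parquet.py below, the input must be scored and carry global peptide- and protein-level scores):

from click.testing import CliRunner

from pyprophet.cli.export import export_library

runner = CliRunner()
result = runner.invoke(
    export_library,
    [
        "--in", "scored_run.oswpq",       # hypothetical scored split-parquet input
        "--out", "openswath_library.tsv",
        "--max_peakgroup_qvalue", "0.01",
        "--min_fragments", "4",
        "--no-keep_decoys",               # default: decoys are filtered out
        "--test",                         # deterministic sort for snapshot tests
    ],
)
print(result.exit_code)
print(result.output)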

# Export to Parquet
@click.command(name="parquet", cls=AdvancedHelpCommand)
57 changes: 57 additions & 0 deletions pyprophet/io/_base.py
@@ -48,6 +48,7 @@
import duckdb
import pandas as pd
import polars as pl
import sklearn.preprocessing as preprocessing # For MinMaxScaler
Copilot AI (Aug 19, 2025): Import comment should follow PEP 8 guidelines; use two spaces before the '#' in '# For MinMaxScaler'.
from loguru import logger

from .._base import BaseIOConfig
@@ -619,6 +620,62 @@ def export_results(self, data: pd.DataFrame):
else:
raise ValueError(f"Unsupported export format: {cfg.export_format}")

def clean_and_export_library(self, data: pd.DataFrame) -> None:
"""
Clean the input DataFrame and export it as a TSV library.

Args:
data: Input DataFrame with library data

"""
cfg = self.config

# For precursors found in more than one run, select the run with the smallest q-value
# If q-values tie, break ties by intensity, then by run ID
data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1)
assert len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value, Intensity and RunId, duplicate transition IDs were found."

# Recompute the Annotation column if it is entirely missing (all NaN or "NA")
if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
logger.debug("Annotation column is empty, so computing it manually.")
data.drop(columns=['Annotation'], inplace=True)
data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str)

if cfg.rt_calibration and cfg.rt_unit == "iRT":
data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100
if cfg.intensity_calibration:
data['LibraryIntensity'] = (
data['LibraryIntensity'] /
data.groupby('Precursor')['LibraryIntensity'].transform('max') *
10000)
logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0])))
# Remove rows with zero intensity
data = data[data['LibraryIntensity'] > 0]

## Print Library statistics
logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors")

logger.info(f"Precursor Fragment Distribution (Before Filtering)")
num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count()
Copilot AI (Aug 19, 2025): This line is overly complex with multiple chained operations. Consider breaking it into multiple steps for better readability and debugging.
for frag, count in num_frags_per_prec.iterrows():
logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")

logger.info(f"Filter library to precursors containing {cfg.min_fragments} or more fragments")
ids_to_keep = data[['Precursor', 'Annotation']].groupby('Precursor').count()
ids_to_keep = ids_to_keep[ids_to_keep['Annotation'] >= cfg.min_fragments].index
data = data[data['Precursor'].isin(ids_to_keep)]

logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors")
if cfg.keep_decoys:
logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
Copilot AI (Aug 19, 2025): Use f-string formatting instead of .format() for consistency with the rest of the codebase and better performance.

data.drop(columns=['TransitionId', 'Q_Value', 'RunId', 'Intensity'], inplace=True)
if cfg.test:
data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz'])

logger.info("Exporting library to file.")
data.to_csv(cfg.outfile, sep='\t', index=False)

def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
"""
Export quantification matrix at specified level with optional normalization.
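The cleaning steps above are easiest to see on a toy frame. Below is a self-contained sketch of the iRT scaling, per-precursor intensity calibration, and min_fragments filtering using the same column names; the data itself is fabricated:

import pandas as pd
import sklearn.preprocessing as preprocessing

data = pd.DataFrame({
    "Precursor": ["PEPTIDEA_2"] * 5 + ["PEPTIDEB_3"] * 2,
    "Annotation": ["b2^1", "y3^1", "y4^1", "y5^2", "y6^1", "b2^1", "y2^1"],
    "NormalizedRetentionTime": [55.2] * 5 + [80.1] * 2,
    "LibraryIntensity": [100.0, 400.0, 250.0, 0.0, 320.0, 300.0, 150.0],
})

# iRT scaling: min-max over empirical RTs, stretched to the 0-100 range
data["NormalizedRetentionTime"] = (
    preprocessing.MinMaxScaler().fit_transform(data[["NormalizedRetentionTime"]]) * 100
)

# Intensity calibration: rescale to a per-precursor maximum of 10000,
# then drop fragments with zero intensity
data["LibraryIntensity"] = (
    data["LibraryIntensity"]
    / data.groupby("Precursor")["LibraryIntensity"].transform("max")
    * 10000
)
data = data[data["LibraryIntensity"] > 0]

# min_fragments filter: keep precursors with at least 4 surviving fragments;
# PEPTIDEA_2 keeps 4 fragments and survives, PEPTIDEB_3 is removed
counts = data.groupby("Precursor")["Annotation"].count()
data = data[data["Precursor"].isin(counts[counts >= 4].index)]
print(data)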
3 changes: 3 additions & 0 deletions pyprophet/io/export/osw.py
@@ -123,6 +123,9 @@ def _read_sqlite(self, con):
"""Main entry point for reading SQLite data, delegates to specific methods."""
cfg = self.config

if self.config.export_format == "library":
raise NotImplementedError("Library export from sqlite OSW files is not supported")

if self._is_unscored_file(con):
logger.info("Reading unscored data from Parquet file.")
return self._read_unscored_data(con)
3 changes: 3 additions & 0 deletions pyprophet/io/export/parquet.py
@@ -36,6 +36,9 @@ def read(self) -> pd.DataFrame:
try:
self._init_duckdb_views(con)

if self.config.export_format == "library":
raise NotImplementedError("Library export from non-split .parquet files is not supported")

if self._is_unscored_file():
logger.info("Reading unscored data from Parquet file.")
return self._read_unscored_data(con)
82 changes: 81 additions & 1 deletion pyprophet/io/export/split_parquet.py
@@ -68,6 +68,18 @@ def read(self) -> pd.DataFrame:
try:
self._init_duckdb_views(con)

if self.config.export_format == "library":
if self._is_unscored_file():
descr= "Files must be scored for library generation."
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'descr = "Files must be scored for library generation."'.
logger.exception(descr)
raise ValueError(descr)
if not self._has_peptide_protein_global_scores():
descr= "Files must have peptide and protein level global scores for library generation."
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'descr = "Files must have peptide and protein level global scores for library generation."'.
logger.exception(descr)
raise ValueError(descr)
logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
return self._read_library_data(con)

if self._is_unscored_file():
logger.info("Reading unscored data from split Parquet files.")
return self._read_unscored_data(con)
@@ -82,9 +94,17 @@ def read(self) -> pd.DataFrame:
logger.info("Reading standard OpenSWATH data from split Parquet files.")
data = self._read_standard_data(con)

return self._augment_data(data, con)
return self._augment_data(data, con)
finally:
con.close()

def _has_peptide_protein_global_scores(self) -> bool:
"""
Check if files contain peptide and protein global scores
"""
has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns)
has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns)
return has_peptide and has_protein

def _is_unscored_file(self) -> bool:
"""
@@ -257,6 +277,66 @@ def _read_augmented_data(self, con) -> pd.DataFrame:

return pd.merge(data, ipf_data, on="id", how="left")

def _read_library_data(self, con) -> pd.DataFrame:
"""
Read precursor-level data for library generation. This does not include all of the columns present in the standard export output.
"""
if self.config.rt_calibration:
rt_col = "p.EXP_RT"
else:
rt_col = "p.PRECURSOR_LIBRARY_RT"

if self.config.im_calibration:
im_col = "p.EXP_IM"
else:
im_col = "p.PRECURSOR_LIBRARY_DRIFT_TIME"

if self.config.intensity_calibration:
intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY'
else:
intensity_col = 't.TRANSITION_LIBRARY_INTENSITY'

if self.config.keep_decoys:
decoy_query = ""
else:
decoy_query ="p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"
Copilot AI (Aug 19, 2025): Inconsistent spacing around the assignment operator; should be 'decoy_query = "p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"'.

query = f"""
SELECT
{rt_col} as NormalizedRetentionTime,
{im_col} as PrecursorIonMobility,
{intensity_col} as LibraryIntensity,
p.SCORE_MS2_Q_VALUE as Q_Value,
p.UNMODIFIED_SEQUENCE AS PeptideSequence,
p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
p.PRECURSOR_CHARGE AS PrecursorCharge,
p.FEATURE_MS2_AREA_INTENSITY AS Intensity,
p.RUN_ID AS RunId,
(p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
p.PRECURSOR_MZ AS PrecursorMz,
STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName,
t.ANNOTATION as Annotation,
t.PRODUCT_MZ as ProductMz,
t.TRANSITION_CHARGE as FragmentCharge,
t.TRANSITION_TYPE as FragmentType,
t.TRANSITION_ORDINAL as FragmentSeriesNumber,
t.TRANSITION_ID as TransitionId,
p.PRECURSOR_DECOY as Decoy
FROM precursors p
INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID
WHERE {decoy_query}
p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and
p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and
p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and
p.SCORE_MS2_PEAK_GROUP_RANK = 1

GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID, p.FEATURE_MS2_AREA_INTENSITY
"""
return con.execute(query).fetchdf()

def _read_standard_data(self, con) -> pd.DataFrame:
"""
Read standard OpenSWATH data without IPF from split files.
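To make the conditional WHERE clause concrete, here is a small sketch of how the decoy predicate splices into the query for both keep_decoys settings; the column names are those used by _read_library_data, and build_where_clause is a hypothetical helper, not part of this PR:

def build_where_clause(keep_decoys: bool, max_q: float = 0.01) -> str:
    # Mirrors the branch above: with keep_decoys enabled, the decoy
    # predicate is dropped and decoys are filtered by q-value alone.
    decoy_query = "" if keep_decoys else (
        "p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and"
    )
    return f"""WHERE {decoy_query}
        p.SCORE_MS2_Q_VALUE < {max_q} and
        p.SCORE_MS2_PEAK_GROUP_RANK = 1"""

print(build_where_clause(keep_decoys=False))
print(build_where_clause(keep_decoys=True))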