From fbbffd34b8a359b54b743799083d7b0cbce756b0 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 17 Jun 2025 16:04:04 -0400
Subject: [PATCH 01/25] feature: start implementation of lib export with
 pyprophet

---
 pyprophet/cli/export.py              | 73 ++++++++++++++++++++++++++++
 pyprophet/io/_base.py                | 11 +++++
 pyprophet/io/export/split_parquet.py | 50 +++++++++++++++++++
 3 files changed, 134 insertions(+)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 9ca6a3e1..083ce272 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -37,6 +37,7 @@ def export():
         pass
 
     export.add_command(export_tsv, name="tsv")
+    export.add_command(export_library, name='library')
     export.add_command(export_matrix, name="matrix")
     export.add_command(export_parquet, name="parquet")
     export.add_command(export_compound, name="compound")
@@ -347,6 +348,78 @@ def export_matrix(
     df = reader.read()
     writer.export_quant_matrix(df)
 
+# Export to Library to be used in OpenSWATH
+@click.command(name="library", cls=AdvancedHelpCommand)
+@click.option(
+    "--in",
+    "infile",
+    required=True,
+    type=click.Path(exists=True),
+    help="PyProphet OSW input file.",
+)
+@click.option(
+    "--out",
+    "outfile",
+    required=False,
+    type=click.Path(exists=False),
+    help="Output tsv library.",
+)
+@click.option(
+    "--max_peakgroup_qvalue",
+    default=0.01,
+    show_default=True,
+    type=float,
+    help="Filter results to maximum run-specific peak group-level q-value, should not use values > 0.01.",
+)
+@click.option(
+    "--max_global_peptide_qvalue",
+    default=0.01,
+    show_default=True,
+    type=float,
+    help="Filter results to maximum global peptide-level q-value, should not use values > 0.01.",
+)
+@click.option(
+    "--max_global_protein_qvalue",
+    default=0.01,
+    show_default=True,
+    type=float,
+    help="Filter results to maximum global protein-level q-value, should not use values > 0.01.",
+)
+@measure_memory_usage_and_time
+def export_library(
+    infile,
+    outfile,
+    max_peakgroup_qvalue,
+    max_global_peptide_qvalue,
+    max_global_protein_qvalue,
+):
+    """
+    Export OSW to tsv library format
+    """
+    config = ExportIOConfig(
+        infile=infile,
+        outfile=outfile,
+        subsample_ratio=1.0,  # Not used in export
+        level="export",
+        context="export",
+        export_format=format,
+        out_type="tsv",
+        max_rs_peakgroup_qvalue=max_peakgroup_qvalue,
+        max_global_peptide_qvalue=max_global_peptide_qvalue,
+        max_global_protein_qvalue=max_global_protein_qvalue,
+    )
+
+    reader = ReaderDispatcher.get_reader(config)
+    writer = WriterDispatcher.get_writer(config)
+
+    df = reader.read_for_library()
+    logger.debug(df.columns)
+    logger.info(f"Library Contains {len(df['Precursor'].drop_duplicates())} Precursors")
+    logger.info(f"Precursor Fragment Distribution")
+    num_frags_per_prec = df[['Precursor', 'Annotation']].groupby("Precursor").count().reset_index(names='Precursor').groupby('Annotation').count()
+    for frag, count in num_frags_per_prec.iterrows():
+        logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")
+    writer.export_library(df)
 
 # Export to Parquet
 @click.command(name="parquet", cls=AdvancedHelpCommand)
diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 66588353..d1735fe2 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -617,6 +617,17 @@ def export_results(self, data: pd.DataFrame):
         else:
             raise ValueError(f"Unsupported export format: {cfg.export_format}")
 
+    def export_library(self, data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Export library data at specified level.
+
+        Args:
+            data: Input DataFrame with library data
+
+        """
+        cfg = self.config
+        data.to_csv(cfg.outfile, sep='\t', index=False)
+
     def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
         """
         Export quantification matrix at specified level with optional normalization.
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 6137d533..4e05b188 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -85,6 +85,23 @@ def read(self) -> pd.DataFrame:
             return self._augment_data(data, con)
         finally:
             con.close()
+    
+    def read_for_library(self) -> pd.DataFrame:
+        """
+        Read data specifically for library generation, which may not include all features.
+        """
+        con = duckdb.connect()
+        try:
+            self._init_duckdb_views(con)
+
+            if self._is_unscored_file():
+                raise logger.exception("Files must be scored for library generation.")
+            
+            logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
+            return self._read_library_data(con)
+        finally:
+            con.close()
+
 
     def _is_unscored_file(self) -> bool:
         """
@@ -257,6 +274,39 @@ def _read_augmented_data(self, con) -> pd.DataFrame:
 
         return pd.merge(data, ipf_data, on="id", how="left")
 
+    def _read_library_data(self, con) -> pd.DataFrame:
+        """
+        Read data specifically for precursors for library generation. This does not include all output in standard output
+        """
+        logger.debug("Reading library data!!!!!")
+        query = f"""
+            SELECT
+                p.EXP_RT AS RT,
+                p.UNMODIFIED_SEQUENCE AS PeptideSequence,
+                p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
+                p.PRECURSOR_CHARGE AS Charge,
+                (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor,
+                p.PRECURSOR_MZ AS mz,
+                p.FEATURE_MS2_AREA_INTENSITY AS Intensity,
+                t.FEATURE_ID AS id,
+                t.FEATURE_TRANSITION_AREA_INTENSITY AS FragmentIonIntensity,
+                t.ANNOTATION as Annotation,
+                t.PRODUCT_MZ as ProductMz,
+                t.TRANSITION_CHARGE as FragmentCharge,
+                t.TRANSITION_TYPE as FragmentType,
+                t.TRANSITION_ORDINAL as FragmentSeriesNumber
+            FROM precursors p
+            INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID
+            WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and
+                  p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and
+                  p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and
+                  p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and
+                  p.SCORE_MS2_PEAK_GROUP_RANK = 1
+
+            ORDER BY p.FEATURE_ID
+        """
+        return con.execute(query).fetchdf()
+    
     def _read_standard_data(self, con) -> pd.DataFrame:
         """
         Read standard OpenSWATH data without IPF from split files.

From 02d5d650272a38cd12355adec7fcdd2d050b7bd3 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 18 Jun 2025 09:30:03 -0400
Subject: [PATCH 02/25] more functionality to lib export

Add options to enable or disable calibration of IM, RT, and MS2 fragment intensity
---
 pyprophet/_config.py                 | 12 ++++++++
 pyprophet/cli/export.py              | 41 +++++++++++++++++++++++-----
 pyprophet/io/_base.py                | 34 +++++++++++++++++++++--
 pyprophet/io/export/split_parquet.py | 29 +++++++++++++++-----
 4 files changed, 100 insertions(+), 16 deletions(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index 41ec4b1e..acc0791d 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -662,6 +662,12 @@ class ExportIOConfig(BaseIOConfig):
 
         # SqMass: Export to parquet
         pqp_file (Optional[str]): Path to PQP file for precursor/transition mapping.
+
+        # Export to library
+        rt_calibration (bool): If True, will use emperical RT values as oppose to the original library RT values
+        im_calibration (bool): If True, will use emperical IM values as oppose to the original library IM values
+        intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values
+        min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True
     """
 
     export_format: Literal[
@@ -691,3 +697,9 @@ class ExportIOConfig(BaseIOConfig):
 
     # SqMass: Export to parquet
     pqp_file: Optional[str] = None  # Path to PQP file for precursor/transition mapping
+
+    # Export to library
+    rt_calibration: bool = True
+    im_calibration: bool = True
+    intensity_calibration: bool = True
+    min_fragments: int = 6
\ No newline at end of file
diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 083ce272..ecb59fff 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -385,6 +385,31 @@ def export_matrix(
     type=float,
     help="Filter results to maximum global protein-level q-value, should not use values > 0.01.",
 )
+@click.option(
+    "--rt_calibration/--no-rt_calibration",
+    default=True,
+    show_default=True,
+    help="Use empirical RT values as oppose to the original library RT values."
+)
+@click.option(
+    "--im_calibration/--no-im_calibration",
+    default=True,
+    show_default=True,
+    help="Use empirical IM values as oppose to the original library IM values."
+)
+@click.option(
+    "--intensity_calibration/--no-intensity_calibration",
+    default=True,
+    show_default=True,
+    help="Use empirical intensity values as oppose to the original library intensity values."
+)
+@click.option(
+    "--min_fragments",
+    default=3,
+    show_default=True,
+    type=int,
+    help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True."
+)
 @measure_memory_usage_and_time
 def export_library(
     infile,
@@ -392,6 +417,10 @@ def export_library(
     max_peakgroup_qvalue,
     max_global_peptide_qvalue,
     max_global_protein_qvalue,
+    rt_calibration,
+    im_calibration,
+    intensity_calibration,
+    min_fragments,
 ):
     """
     Export OSW to tsv library format
@@ -407,19 +436,17 @@ def export_library(
         max_rs_peakgroup_qvalue=max_peakgroup_qvalue,
         max_global_peptide_qvalue=max_global_peptide_qvalue,
         max_global_protein_qvalue=max_global_protein_qvalue,
+        rt_calibration=rt_calibration,
+        im_calibration=im_calibration,
+        intensity_calibration=intensity_calibration,
+        min_fragments=min_fragments,
     )
 
     reader = ReaderDispatcher.get_reader(config)
     writer = WriterDispatcher.get_writer(config)
 
     df = reader.read_for_library()
-    logger.debug(df.columns)
-    logger.info(f"Library Contains {len(df['Precursor'].drop_duplicates())} Precursors")
-    logger.info(f"Precursor Fragment Distribution")
-    num_frags_per_prec = df[['Precursor', 'Annotation']].groupby("Precursor").count().reset_index(names='Precursor').groupby('Annotation').count()
-    for frag, count in num_frags_per_prec.iterrows():
-        logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")
-    writer.export_library(df)
+    writer.clean_and_export_library(df)
 
 # Export to Parquet
 @click.command(name="parquet", cls=AdvancedHelpCommand)
diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index d1735fe2..7ef01710 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -617,15 +617,45 @@ def export_results(self, data: pd.DataFrame):
         else:
             raise ValueError(f"Unsupported export format: {cfg.export_format}")
 
-    def export_library(self, data: pd.DataFrame) -> pd.DataFrame:
+    def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         """
-        Export library data at specified level.
+        This function cleans the original dataframe and exports the library
 
         Args:
             data: Input DataFrame with library data
 
         """
         cfg = self.config
+
+        # For precursors found in more than one run, select the run with the smallest q value
+        data = data.sort_values(by='Q_Value').groupby("TransitionId").head(1)
+        assert (len(data['TransitionId'].drop_duplicates()) == len(data))
+
+        # Remove Annotation Column if all NAN
+        if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
+            logger.debug("Annotation column is empty, dropping it.")
+            data.drop(columns=['Annotation'], inplace=True)
+
+        import sklearn.preprocessing as preprocessing
+        if cfg.rt_calibration:
+            data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100
+        if cfg.intensity_calibration:
+            data['LibraryIntensity'] = (
+            data['LibraryIntensity'] /
+            data.groupby('Precursor')['LibraryIntensity'].transform('max') *
+            10000)
+
+        
+        ## Print Library statistics
+        logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors")
+        logger.info(f"Precursor Fragment Distribution")
+        num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count()
+        for frag, count in num_frags_per_prec.iterrows():
+            logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")
+        
+        data.drop(columns=['TransitionId', 'Q_Value'], inplace=True)
+
+        logger.info("Exporting library to file.")
         data.to_csv(cfg.outfile, sep='\t', index=False)
 
     def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 4e05b188..550b5591 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -278,23 +278,38 @@ def _read_library_data(self, con) -> pd.DataFrame:
         """
         Read data specifically for precursors for library generation. This does not include all output in standard output
         """
-        logger.debug("Reading library data!!!!!")
+        if self.config.rt_calibration:
+            rt_query = "p.Norm_RT as NormalizedRetentionTime"
+        else:
+            rt_query = "p.EXP_RT as NormalizedRetentionTime"
+
+        if self.config.im_calibration:
+            im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility"
+        else:
+            im_query = "p.EXP_IM as PrecursorIonMobility"
+
+        if self.config.intensity_calibration:
+            intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity'
+        else:
+            intensity_query = 't.TRANSITION_LIBRARY_INTENSITY AS LibraryIntensity'
+
         query = f"""
             SELECT
-                p.EXP_RT AS RT,
+                {rt_query},
+                {im_query},
+                {intensity_query},
+                p.SCORE_MS2_Q_VALUE as Q_Value,
                 p.UNMODIFIED_SEQUENCE AS PeptideSequence,
                 p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
                 p.PRECURSOR_CHARGE AS Charge,
                 (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor,
-                p.PRECURSOR_MZ AS mz,
-                p.FEATURE_MS2_AREA_INTENSITY AS Intensity,
-                t.FEATURE_ID AS id,
-                t.FEATURE_TRANSITION_AREA_INTENSITY AS FragmentIonIntensity,
+                p.PRECURSOR_MZ AS PrecursorMz,
                 t.ANNOTATION as Annotation,
                 t.PRODUCT_MZ as ProductMz,
                 t.TRANSITION_CHARGE as FragmentCharge,
                 t.TRANSITION_TYPE as FragmentType,
-                t.TRANSITION_ORDINAL as FragmentSeriesNumber
+                t.TRANSITION_ORDINAL as FragmentSeriesNumber,
+                t.TRANSITION_ID as TransitionId
             FROM precursors p
             INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID
             WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and

From 1ef91f99ae851a05ca4d72291216adb0692ebf22 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 9 Jul 2025 11:50:24 -0400
Subject: [PATCH 03/25] change default min frags to 4

---
 pyprophet/cli/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index ecb59fff..6acea357 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -405,7 +405,7 @@ def export_matrix(
 )
 @click.option(
     "--min_fragments",
-    default=3,
+    default=4,
     show_default=True,
     type=int,
     help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True."

From 0c999a665a03a53f7356bd93dbead9ff84fe33a5 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 9 Jul 2025 12:27:28 -0400
Subject: [PATCH 04/25] filter fragments with 0 library intensity

---
 pyprophet/io/_base.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 7ef01710..750114b6 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -644,15 +644,23 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             data['LibraryIntensity'] /
             data.groupby('Precursor')['LibraryIntensity'].transform('max') *
             10000)
-
+            data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity
         
         ## Print Library statistics
         logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors")
-        logger.info(f"Precursor Fragment Distribution")
+
+        logger.info(f"Precursor Fragment Distribution (Before Filtering)")
         num_frags_per_prec = data[['Precursor', 'TransitionId']].groupby("Precursor").count().reset_index(names='Precursor').groupby('TransitionId').count()
         for frag, count in num_frags_per_prec.iterrows():
             logger.info(f"There are {count['Precursor']} precursors with {frag} fragment(s)")
         
+        logger.info(f"Filter library to precursors containing {cfg.min_fragments} or more fragments")
+        ids_to_keep = data[['Precursor', 'Annotation']].groupby('Precursor').count()
+        ids_to_keep = ids_to_keep[ ids_to_keep['Annotation'] >= cfg.min_fragments ].index
+        data = data[ data['Precursor'].isin(ids_to_keep) ]
+
+        logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors")
+        
         data.drop(columns=['TransitionId', 'Q_Value'], inplace=True)
 
         logger.info("Exporting library to file.")

From 0a6e7db0276963229b66b8780b9348107aed5ac6 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 9 Jul 2025 12:28:10 -0400
Subject: [PATCH 05/25] require a --out parameter

---
 pyprophet/cli/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 6acea357..3c0fc825 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -360,7 +360,7 @@ def export_matrix(
 @click.option(
     "--out",
     "outfile",
-    required=False,
+    required=True, # need to name the library or else get error in os.path.splittext line 75, in __post_init__in _base.
     type=click.Path(exists=False),
     help="Output tsv library.",
 )

From 5f41220b9e17854b913410e6aaa7ea2bc2a0e265 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 9 Jul 2025 12:28:27 -0400
Subject: [PATCH 06/25] change config default min_fragments from 6 to 4

---
 pyprophet/_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index acc0791d..1cf5ec58 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -702,4 +702,4 @@ class ExportIOConfig(BaseIOConfig):
     rt_calibration: bool = True
     im_calibration: bool = True
     intensity_calibration: bool = True
-    min_fragments: int = 6
\ No newline at end of file
+    min_fragments: int = 4
\ No newline at end of file

From d725eb83d10e8ef0cc403543781e7f553a7f0d01 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Thu, 10 Jul 2025 10:53:48 -0400
Subject: [PATCH 07/25] fix: swap inverted RT/IM calibration columns, key
 Precursor by charge; update docs

---
 pyprophet/io/export/split_parquet.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 550b5591..a4dc0ab1 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -88,7 +88,7 @@ def read(self) -> pd.DataFrame:
     
     def read_for_library(self) -> pd.DataFrame:
         """
-        Read data specifically for library generation, which may not include all features.
+        Read data specifically for library generation
         """
         con = duckdb.connect()
         try:
@@ -279,14 +279,14 @@ def _read_library_data(self, con) -> pd.DataFrame:
         Read data specifically for precursors for library generation. This does not include all output in standard output
         """
         if self.config.rt_calibration:
-            rt_query = "p.Norm_RT as NormalizedRetentionTime"
-        else:
             rt_query = "p.EXP_RT as NormalizedRetentionTime"
+        else:
+            rt_query = "p.PRECURSOR_LIBRARY_RT as NormalizedRetentionTime"
 
         if self.config.im_calibration:
-            im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility"
-        else:
             im_query = "p.EXP_IM as PrecursorIonMobility"
+        else:
+            im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility"
 
         if self.config.intensity_calibration:
             intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity'
@@ -301,8 +301,8 @@ def _read_library_data(self, con) -> pd.DataFrame:
                 p.SCORE_MS2_Q_VALUE as Q_Value,
                 p.UNMODIFIED_SEQUENCE AS PeptideSequence,
                 p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
-                p.PRECURSOR_CHARGE AS Charge,
-                (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_ID AS VARCHAR)) AS Precursor,
+                p.PRECURSOR_CHARGE AS PrecursorCharge,
+                (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
                 p.PRECURSOR_MZ AS PrecursorMz,
                 t.ANNOTATION as Annotation,
                 t.PRODUCT_MZ as ProductMz,

From 9b108aafbb62cf3cfa1b778435ee3627fa5026a2 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Thu, 10 Jul 2025 12:10:24 -0400
Subject: [PATCH 08/25] fix: export protein info in lib

---
 pyprophet/io/export/split_parquet.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index a4dc0ab1..0e77133f 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -279,31 +279,32 @@ def _read_library_data(self, con) -> pd.DataFrame:
         Read data specifically for precursors for library generation. This does not include all output in standard output
         """
         if self.config.rt_calibration:
-            rt_query = "p.EXP_RT as NormalizedRetentionTime"
+            rt_col = "p.EXP_RT"
         else:
-            rt_query = "p.PRECURSOR_LIBRARY_RT as NormalizedRetentionTime"
+            rt_col = "p.PRECURSOR_LIBRARY_RT"
 
         if self.config.im_calibration:
-            im_query = "p.EXP_IM as PrecursorIonMobility"
+            im_col = "p.EXP_IM"
         else:
-            im_query = "p.PRECURSOR_LIBRARY_DRIFT_TIME as PrecursorIonMobility"
+            im_col = "p.PRECURSOR_LIBRARY_DRIFT_TIME"
 
         if self.config.intensity_calibration:
-            intensity_query = 't.FEATURE_TRANSITION_AREA_INTENSITY AS LibraryIntensity'
+            intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY'
         else:
-            intensity_query = 't.TRANSITION_LIBRARY_INTENSITY AS LibraryIntensity'
+            intensity_col = 't.TRANSITION_LIBRARY_INTENSITY'
 
         query = f"""
             SELECT
-                {rt_query},
-                {im_query},
-                {intensity_query},
+                {rt_col} as NormalizedRetentionTime,
+                {im_col} as PrecursorIonMobility,
+                {intensity_col} as LibraryIntensity,
                 p.SCORE_MS2_Q_VALUE as Q_Value,
                 p.UNMODIFIED_SEQUENCE AS PeptideSequence,
                 p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
                 p.PRECURSOR_CHARGE AS PrecursorCharge,
                 (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
                 p.PRECURSOR_MZ AS PrecursorMz,
+                STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName,
                 t.ANNOTATION as Annotation,
                 t.PRODUCT_MZ as ProductMz,
                 t.TRANSITION_CHARGE as FragmentCharge,
@@ -318,7 +319,10 @@ def _read_library_data(self, con) -> pd.DataFrame:
                   p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and
                   p.SCORE_MS2_PEAK_GROUP_RANK = 1
 
-            ORDER BY p.FEATURE_ID
+            GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
+                     p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
+                     p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
+                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID
         """
         return con.execute(query).fetchdf()
     

From e36e23f03564e1621223f332384fe15b4f4e022b Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Fri, 11 Jul 2025 14:00:57 -0400
Subject: [PATCH 09/25] fix: lib export compute annotation col if empty

---
 pyprophet/io/_base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 750114b6..bde41930 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -633,8 +633,9 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
 
         # Remove Annotation Column if all NAN
         if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
-            logger.debug("Annotation column is empty, dropping it.")
+            logger.debug("Annotation column is empty, so computing it manually.")
             data.drop(columns=['Annotation'], inplace=True)
+            data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str)
 
         import sklearn.preprocessing as preprocessing
         if cfg.rt_calibration:

From 76bbff78dbb50f137c5373a928bb775ce582ed26 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Wed, 30 Jul 2025 12:10:52 -0400
Subject: [PATCH 10/25] feature: option to keep significant decoys in lib
 refinement

---
 pyprophet/_config.py                 |  4 +++-
 pyprophet/cli/export.py              |  9 +++++++++
 pyprophet/io/_base.py                |  4 +++-
 pyprophet/io/export/split_parquet.py | 12 +++++++++---
 4 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index 1cf5ec58..4f2375d2 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -668,6 +668,7 @@ class ExportIOConfig(BaseIOConfig):
         im_calibration (bool): If True, will use emperical IM values as oppose to the original library IM values
         intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values
         min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True
+        keep_decoys (bool): Whether to keep decoy entries in the library, will only keep decoys that pass the thresholds specified
     """
 
     export_format: Literal[
@@ -702,4 +703,5 @@ class ExportIOConfig(BaseIOConfig):
     rt_calibration: bool = True
     im_calibration: bool = True
     intensity_calibration: bool = True
-    min_fragments: int = 4
\ No newline at end of file
+    min_fragments: int = 4
+    keep_decoys: bool = False  # Whether to keep decoy entries in the library
\ No newline at end of file
diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 3c0fc825..439a2d40 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -410,6 +410,13 @@ def export_matrix(
     type=int,
     help="Minimum number of fragments required to include the peak group in the library, only relevant if intensityCalibration is True."
 )
+@click.option(
+    "--keep_decoys/--no-keep_decoys",
+    default=True,
+    show_default=True,
+    type=bool,
+    help="Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
+)
 @measure_memory_usage_and_time
 def export_library(
     infile,
@@ -421,6 +428,7 @@ def export_library(
     im_calibration,
     intensity_calibration,
     min_fragments,
+    keep_decoys
 ):
     """
     Export OSW to tsv library format
@@ -440,6 +448,7 @@ def export_library(
         im_calibration=im_calibration,
         intensity_calibration=intensity_calibration,
         min_fragments=min_fragments,
+        keep_decoys=keep_decoys
     )
 
     reader = ReaderDispatcher.get_reader(config)
diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 26613e36..090330ec 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -663,7 +663,9 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         data = data[ data['Precursor'].isin(ids_to_keep) ]
 
         logger.info(f"After filtering, library contains {len(data['Precursor'].drop_duplicates())} Precursors")
-        
+        if cfg.keep_decoys:
+            logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
+
         data.drop(columns=['TransitionId', 'Q_Value'], inplace=True)
 
         logger.info("Exporting library to file.")
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 0e77133f..73c0f3e9 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -292,6 +292,11 @@ def _read_library_data(self, con) -> pd.DataFrame:
             intensity_col = 't.FEATURE_TRANSITION_AREA_INTENSITY'
         else:
             intensity_col = 't.TRANSITION_LIBRARY_INTENSITY'
+        
+        if self.config.keep_decoys:
+            decoy_query = ""
+        else:
+            decoy_query ="p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and" 
 
         query = f"""
             SELECT
@@ -310,10 +315,11 @@ def _read_library_data(self, con) -> pd.DataFrame:
                 t.TRANSITION_CHARGE as FragmentCharge,
                 t.TRANSITION_TYPE as FragmentType,
                 t.TRANSITION_ORDINAL as FragmentSeriesNumber,
-                t.TRANSITION_ID as TransitionId
+                t.TRANSITION_ID as TransitionId,
+                p.PRECURSOR_DECOY as Decoy
             FROM precursors p
             INNER JOIN transition t ON p.FEATURE_ID = t.FEATURE_ID
-            WHERE p.PRECURSOR_DECOY is false and t.TRANSITION_DECOY is false and
+            WHERE {decoy_query} 
                   p.SCORE_MS2_Q_VALUE < {self.config.max_rs_peakgroup_qvalue} and
                   p.SCORE_PROTEIN_GLOBAL_Q_VALUE < {self.config.max_global_protein_qvalue} and
                   p.SCORE_PEPTIDE_GLOBAL_Q_VALUE < {self.config.max_global_peptide_qvalue} and
@@ -322,7 +328,7 @@ def _read_library_data(self, con) -> pd.DataFrame:
             GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
                      p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
                      p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
-                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID
+                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY
         """
         return con.execute(query).fetchdf()
     

From f8b3753c00063f8bf2bf35ae555bcc0b061ca2cf Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Thu, 7 Aug 2025 17:27:15 -0400
Subject: [PATCH 11/25] verbose: note that keep_decoys in lib gen is
 experimental feature

---
 pyprophet/cli/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 439a2d40..da6f30a5 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -415,7 +415,7 @@ def export_matrix(
     default=True,
     show_default=True,
     type=bool,
-    help="Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
+    help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
 )
 @measure_memory_usage_and_time
 def export_library(

From 4b9076bc2fd66646643b23d7eaff6d8f9ac0d400 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Fri, 8 Aug 2025 08:28:03 -0400
Subject: [PATCH 12/25] test: add test for lib generation

---
 pyprophet/_config.py           |  2 ++
 pyprophet/cli/export.py        | 11 +++++++++--
 pyprophet/io/_base.py          |  2 ++
 tests/test_pyprophet_export.py | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index 4f2375d2..3ad1398b 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -653,6 +653,7 @@ class ExportIOConfig(BaseIOConfig):
         top_n (int): Number of top intense features to use for summarization
         consistent_top (bool): Whether to use same top features across all runs
         normalization (Literal["none", "median", "medianmedian", "quantile"]): Normalization method
+        test: bool = False: Whether to enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge
 
         # OSW: Export to parquet
         compression_method (Literal["none", "snappy", "gzip", "brotli", "zstd"]): Compression method for parquet files.
@@ -684,6 +685,7 @@ class ExportIOConfig(BaseIOConfig):
     max_global_peptide_qvalue: float = 0.01
     protein: bool = True
     max_global_protein_qvalue: float = 0.01
+    test: bool = False
 
     # Quantification matrix options
     top_n: int = 3
diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index da6f30a5..19fb67ea 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -417,6 +417,11 @@ def export_matrix(
     type=bool,
     help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
 )
+@click.option(
+    "--test/--no-test",
+    default=False,
+    show_default=True,
+    help="Enable test mode with deterministic behavior, test mode will sort libraries by precursor, fragmentType, fragmentSeriesNumber and fragmentCharge")
 @measure_memory_usage_and_time
 def export_library(
     infile,
@@ -428,7 +433,8 @@ def export_library(
     im_calibration,
     intensity_calibration,
     min_fragments,
-    keep_decoys
+    keep_decoys,
+    test
 ):
     """
     Export OSW to tsv library format
@@ -448,7 +454,8 @@ def export_library(
         im_calibration=im_calibration,
         intensity_calibration=intensity_calibration,
         min_fragments=min_fragments,
-        keep_decoys=keep_decoys
+        keep_decoys=keep_decoys,
+        test=test
     )
 
     reader = ReaderDispatcher.get_reader(config)
diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 090330ec..e4f3e6c6 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -667,6 +667,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
 
         data.drop(columns=['TransitionId', 'Q_Value'], inplace=True)
+        if cfg.test:
+            data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz'])
 
         logger.info("Exporting library to file.")
         data.to_csv(cfg.outfile, sep='\t', index=False)
diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py
index 19e0bece..761247ec 100644
--- a/tests/test_pyprophet_export.py
+++ b/tests/test_pyprophet_export.py
@@ -147,6 +147,40 @@ def test_osw_analysis(
         f"{temp_folder}/test_data.tsv",
     )
 
+@pytest.mark.parametrize(
+    "calib",
+    [ True, False]
+)
+def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, calib
+):
+    # TODO extend to other inputs as well, for now just use split_parquet
+    input_strategy = {
+            "path": test_data_split_parquet,
+            "reader": "parquet_split",
+            "cmd_prefix": f"--in={test_data_split_parquet}",
+        }
+
+    cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && "
+
+    # peptide-level
+    cmd += f"pyprophet infer peptide --pi0_lambda=0.001 0 0 {input_strategy['cmd_prefix']} --context=global && "
+
+    # protein-level
+    cmd += f"pyprophet infer protein --pi0_lambda=0 0 0 {input_strategy['cmd_prefix']} --context=global && "
+
+    # export
+    if calib:
+        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1"
+    else:
+        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration"
+
+    run_pyprophet_command(cmd, temp_folder)
+    validate_export_results(
+        regtest,
+        input_strategy["path"],
+        input_strategy["reader"],
+        f"{temp_folder}/test_lib.tsv",
+    )
 
 def test_osw_unscored(input_strategy, temp_folder, regtest):
     """Test export of unscored OSW data"""

From f455b10fcb48ed114eb3a42dc0cd6ccf02105b7d Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Fri, 8 Aug 2025 08:48:56 -0400
Subject: [PATCH 13/25] minor refactor for better support across different i/o

---
 pyprophet/_config.py                 |  5 +++--
 pyprophet/cli/export.py              |  4 ++--
 pyprophet/io/export/split_parquet.py | 30 ++++++++++++++--------------
 3 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index 3ad1398b..55c98a17 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -635,6 +635,7 @@ class ExportIOConfig(BaseIOConfig):
             - "legacy_split": Split TSV files for each run.
             - "parquet": Single Parquet file with merged results.
             - "parquet_split": Split Parquet files for each run.
+            - "library" : .tsv library file
         out_type (Literal["tsv", "csv"]): Output file type for exported results.
         transition_quantification (bool): Report aggregated transition-level quantification.
         max_transition_pep (float): Maximum PEP to retain scored transitions for quantification (requires transition-level scoring).
@@ -673,7 +674,7 @@ class ExportIOConfig(BaseIOConfig):
     """
 
     export_format: Literal[
-        "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split"
+        "matrix", "legacy_merged", "legacy_split", "parquet", "parquet_split", "library"
     ] = "legacy_merged"
     out_type: Literal["tsv", "csv"] = "tsv"
     transition_quantification: bool = False
@@ -701,7 +702,7 @@ class ExportIOConfig(BaseIOConfig):
     # SqMass: Export to parquet
     pqp_file: Optional[str] = None  # Path to PQP file for precursor/transition mapping
 
-    # Export to library
+    # Export to library options
     rt_calibration: bool = True
     im_calibration: bool = True
     intensity_calibration: bool = True
diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 19fb67ea..bc4a1bc3 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -445,7 +445,7 @@ def export_library(
         subsample_ratio=1.0,  # Not used in export
         level="export",
         context="export",
-        export_format=format,
+        export_format="library",
         out_type="tsv",
         max_rs_peakgroup_qvalue=max_peakgroup_qvalue,
         max_global_peptide_qvalue=max_global_peptide_qvalue,
@@ -461,7 +461,7 @@ def export_library(
     reader = ReaderDispatcher.get_reader(config)
     writer = WriterDispatcher.get_writer(config)
 
-    df = reader.read_for_library()
+    df = reader.read()
     writer.clean_and_export_library(df)
 
 # Export to Parquet
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 73c0f3e9..767901fe 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -68,6 +68,14 @@ def read(self) -> pd.DataFrame:
         try:
             self._init_duckdb_views(con)
 
+            if self.config.export_format == "library":
+                if self._is_unscored_file():
+                    raise logger.exception("Files must be scored for library generation.")
+                if not self._has_peptide_protein_global_scores():
+                    raise logger.exception("Files must have peptide and protein level global scores for library generation.")
+                logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
+                return self._read_library_data(con)
+
             if self._is_unscored_file():
                 logger.info("Reading unscored data from split Parquet files.")
                 return self._read_unscored_data(con)
@@ -82,26 +90,18 @@ def read(self) -> pd.DataFrame:
                 logger.info("Reading standard OpenSWATH data from split Parquet files.")
                 data = self._read_standard_data(con)
 
-            return self._augment_data(data, con)
+                return self._augment_data(data, con)
         finally:
             con.close()
     
-    def read_for_library(self) -> pd.DataFrame:
+    def _has_peptide_protein_global_scores(self) -> bool:
         """
-        Read data specifically for library generation
+        Check if files contain peptide and protein global scores
         """
-        con = duckdb.connect()
-        try:
-            self._init_duckdb_views(con)
-
-            if self._is_unscored_file():
-                raise logger.exception("Files must be scored for library generation.")
-            
-            logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
-            return self._read_library_data(con)
-        finally:
-            con.close()
-
+        print(self._columns)
+        has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns)
+        has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns)
+        return has_peptide and has_protein
 
     def _is_unscored_file(self) -> bool:
         """

From 1667c2578377bba51da8b12d40576dc91f485aaf Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Fri, 8 Aug 2025 09:09:52 -0400
Subject: [PATCH 14/25] add not implemented error for osw/parquet output

also add tests for osw/parquet to test for the not implemented error
---
 pyprophet/io/export/osw.py                    |  3 ++
 pyprophet/io/export/parquet.py                |  3 ++
 ...nalysis_libExport[split_parquet-False].out | 14 ++++++++
 ...analysis_libExport[split_parquet-True].out | 14 ++++++++
 tests/test_pyprophet_export.py                | 32 +++++++++----------
 5 files changed, 50 insertions(+), 16 deletions(-)
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out

diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py
index 02d1669d..db6e2455 100644
--- a/pyprophet/io/export/osw.py
+++ b/pyprophet/io/export/osw.py
@@ -123,6 +123,9 @@ def _read_sqlite(self, con):
         """Main entry point for reading SQLite data, delegates to specific methods."""
         cfg = self.config
 
+        if self.config.export_format == "library":
+            raise NotImplementedError("Library export from non-split .parquet files is not supported")
+ 
         if self._is_unscored_file(con):
             logger.info("Reading unscored data from Parquet file.")
             return self._read_unscored_data(con)
diff --git a/pyprophet/io/export/parquet.py b/pyprophet/io/export/parquet.py
index 16026cdd..556abdd4 100644
--- a/pyprophet/io/export/parquet.py
+++ b/pyprophet/io/export/parquet.py
@@ -36,6 +36,9 @@ def read(self) -> pd.DataFrame:
         try:
             self._init_duckdb_views(con)
 
+            if self.config.export_format == "library":
+                raise NotImplementedError("Library export from non-split .parquet files is not supported")
+            
             if self._is_unscored_file():
                 logger.info("Reading unscored data from Parquet file.")
                 return self._read_unscored_data(con)
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
new file mode 100644
index 00000000..f3e17a61
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity     ModifiedPeptideSequence  NormalizedRetentionTime PeptideSequence                     Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz          ProteinName
+0        -1^1      0               1                    -1          NaN              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153  AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510  AQUA4SWATH_HMLangeE
+2        b4^1      0               1                     4            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   403.1646  AQUA4SWATH_HMLangeE
+3        b5^1      0               1                     5            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   516.2486  AQUA4SWATH_HMLangeE
+4        b5^1      0               1                     5            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195  AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                         ...                      ...             ...                           ...              ...                   ...          ...        ...                  ...
+95      y11^1      0               1                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1119.4173      AQUA4SWATH_Spyo
+96      y11^2      0               2                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   548.2978      AQUA4SWATH_Spyo
+97      y11^2      0               2                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   560.2123      AQUA4SWATH_Spyo
+98      y12^1      0               1                    12            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1152.6099      AQUA4SWATH_Spyo
+99      y12^1      0               1                    12            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1232.5013      AQUA4SWATH_Spyo
+
+[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out
new file mode 100644
index 00000000..d37d89e1
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity     ModifiedPeptideSequence  NormalizedRetentionTime PeptideSequence                     Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz          ProteinName
+0        -1^1      0               1                    -1          NaN        10000.0000      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153  AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b         1912.5839      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510  AQUA4SWATH_HMLangeE
+2        b4^1      0               1                     4            b          704.8697      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   403.1646  AQUA4SWATH_HMLangeE
+3        b5^1      0               1                     5            b         1185.3327      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   516.2486  AQUA4SWATH_HMLangeE
+4        b5^1      0               1                     5            b          910.8388      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195  AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                         ...                      ...             ...                           ...              ...                   ...          ...        ...                  ...
+95      y11^1      0               1                    11            y            7.9894  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1119.4173      AQUA4SWATH_Spyo
+96      y11^2      0               2                    11            y          220.1077  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   548.2978      AQUA4SWATH_Spyo
+97      y11^2      0               2                    11            y          107.0578  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   560.2123      AQUA4SWATH_Spyo
+98      y12^1      0               1                    12            y         1006.7430  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1152.6099      AQUA4SWATH_Spyo
+99      y12^1      0               1                    12            y            2.3968  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1232.5013      AQUA4SWATH_Spyo
+
+[100 rows x 15 columns]
diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py
index 761247ec..1a4910f8 100644
--- a/tests/test_pyprophet_export.py
+++ b/tests/test_pyprophet_export.py
@@ -91,7 +91,10 @@ def run_pyprophet_command(cmd, temp_folder):
         ).decode()
     except subprocess.CalledProcessError as error:
         print(f"Command failed: {cmd}\n{error.output.decode()}", file=sys.stderr)
-        raise
+        if "NotImplementedError" in error.output.decode(): # attempt to catch the specific error rather than the CalledProcessError
+            raise NotImplementedError
+        else:
+            raise 
 
 
 def validate_export_results(
@@ -151,15 +154,8 @@ def test_osw_analysis(
     "calib",
     [ True, False]
 )
-def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, calib
+def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib
 ):
-    # TODO extend to other inputs as well, for now just use split_parquet
-    input_strategy = {
-            "path": test_data_split_parquet,
-            "reader": "parquet_split",
-            "cmd_prefix": f"--in={test_data_split_parquet}",
-        }
-
     cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && "
 
     # peptide-level
@@ -174,13 +170,17 @@ def test_osw_analysis_libExport(test_data_split_parquet, temp_folder, regtest, c
     else:
         cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration"
 
-    run_pyprophet_command(cmd, temp_folder)
-    validate_export_results(
-        regtest,
-        input_strategy["path"],
-        input_strategy["reader"],
-        f"{temp_folder}/test_lib.tsv",
-    )
+    if not input_strategy["reader"] == "parquet_split":
+        with pytest.raises(NotImplementedError):
+            run_pyprophet_command(cmd, temp_folder)
+    else:
+        run_pyprophet_command(cmd, temp_folder)
+        validate_export_results(
+            regtest,
+            input_strategy["path"],
+            input_strategy["reader"],
+            f"{temp_folder}/test_lib.tsv",
+        )
 
 def test_osw_unscored(input_strategy, temp_folder, regtest):
     """Test export of unscored OSW data"""

From 609e84f3d175c52822512fa7ac80febd412c5ecf Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Mon, 11 Aug 2025 16:46:58 -0400
Subject: [PATCH 15/25] feature: add option to export rt unit in non iRT

---
 pyprophet/_config.py    |  4 +++-
 pyprophet/cli/export.py | 10 ++++++++++
 pyprophet/io/_base.py   |  2 +-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/pyprophet/_config.py b/pyprophet/_config.py
index 55c98a17..bec1da7f 100644
--- a/pyprophet/_config.py
+++ b/pyprophet/_config.py
@@ -671,6 +671,7 @@ class ExportIOConfig(BaseIOConfig):
         intensity_calibration (bool): If True, will use emperical intensity values as oppose to the original library intensity values
         min_fragments (int): Minimum number of fragments required to include the peak group in the library, only relevant if intensity_calibration is True
         keep_decoys (bool): Whether to keep decoy entries in the library, will only keep decoys that pass the thresholds specified
+        rt_unit (Literal["iRT", "RT"], default = 'iRT') = "iRT": Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library
     """
 
     export_format: Literal[
@@ -707,4 +708,5 @@ class ExportIOConfig(BaseIOConfig):
     im_calibration: bool = True
     intensity_calibration: bool = True
     min_fragments: int = 4
-    keep_decoys: bool = False  # Whether to keep decoy entries in the library
\ No newline at end of file
+    keep_decoys: bool = False  # Whether to keep decoy entries in the library
+    rt_unit: Literal["iRT", "RT"] = "iRT"
\ No newline at end of file
diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index bc4a1bc3..60694b12 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -417,6 +417,14 @@ def export_matrix(
     type=bool,
     help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"
 )
+@click.option(
+    "--rt_unit",
+    default="iRT",
+    show_default=True,
+    type=click.Choice(["iRT", "RT"]),
+    help='Unit of retention time in the library, only relevant if rt_calibration is True. If "iRT" is selected, the retention times will be scaled to the iRT scale (0-100) in the library.',
+    hidden=True
+)
 @click.option(
     "--test/--no-test",
     default=False,
@@ -434,6 +442,7 @@ def export_library(
     intensity_calibration,
     min_fragments,
     keep_decoys,
+    rt_unit,
     test
 ):
     """
@@ -455,6 +464,7 @@ def export_library(
         intensity_calibration=intensity_calibration,
         min_fragments=min_fragments,
         keep_decoys=keep_decoys,
+        rt_unit=rt_unit,
         test=test
     )
 
diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index e4f3e6c6..c4dcabfa 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -640,7 +640,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str)
 
         import sklearn.preprocessing as preprocessing
-        if cfg.rt_calibration:
+        if cfg.rt_calibration and cfg.rt_unit == "iRT":
             data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100
         if cfg.intensity_calibration:
             data['LibraryIntensity'] = (

From 5c6bda2d634394ab968faf40836da4e861e00f93 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Mon, 11 Aug 2025 16:50:25 -0400
Subject: [PATCH 16/25] remove debug line

---
 pyprophet/io/export/split_parquet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 767901fe..035ae808 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -98,7 +98,6 @@ def _has_peptide_protein_global_scores(self) -> bool:
         """
         Check if files contain peptide and protein global scores
         """
-        print(self._columns)
         has_peptide = any(col.startswith("SCORE_PEPTIDE_GLOBAL") for col in self._columns)
         has_protein = any(col.startswith("SCORE_PROTEIN_GLOBAL") for col in self._columns)
         return has_peptide and has_protein

From bb5607c1e962f7419dd51caba9252e4eabf90ccf Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Mon, 11 Aug 2025 16:53:14 -0400
Subject: [PATCH 17/25] switch keep_decoys default to no-keep_decoys

---
 pyprophet/cli/export.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 60694b12..2d95f8c3 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -412,7 +412,7 @@ def export_matrix(
 )
 @click.option(
     "--keep_decoys/--no-keep_decoys",
-    default=True,
+    default=False,
     show_default=True,
     type=bool,
     help="(Experimental) Whether to keep decoys in the exported library. Default is False, which means decoys are filtered out. Only keeps decoys passing thresholds specified above"

From d959d705e495466943dbd84e888212ae697280be Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Mon, 11 Aug 2025 17:38:57 -0400
Subject: [PATCH 18/25] bug fix: transitions with diff RT

If the q-values are the same for two different runs, then which run's
precursor gets selected is undefined. This can mean that transitions
that are part of the same transition group have different RT/IM. To
address this, also sort by RunId: if the q-values are the same, just
take the first run.
---
 pyprophet/io/_base.py                | 5 +++--
 pyprophet/io/export/split_parquet.py | 3 ++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index c4dcabfa..4a6110e1 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -630,7 +630,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         cfg = self.config
 
         # For precursors found in more than one run, select the run with the smallest q value
-        data = data.sort_values(by='Q_Value').groupby("TransitionId").head(1)
+        # If q values are the same, select the first run
+        data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1)
         assert (len(data['TransitionId'].drop_duplicates()) == len(data))
 
         # Remove Annotation Column if all NAN
@@ -666,7 +667,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         if cfg.keep_decoys:
             logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
 
-        data.drop(columns=['TransitionId', 'Q_Value'], inplace=True)
+        data.drop(columns=['TransitionId', 'Q_Value', 'RunId'], inplace=True)
         if cfg.test:
             data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz'])
 
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 035ae808..d17a75da 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -306,6 +306,7 @@ def _read_library_data(self, con) -> pd.DataFrame:
                 p.UNMODIFIED_SEQUENCE AS PeptideSequence,
                 p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
                 p.PRECURSOR_CHARGE AS PrecursorCharge,
+                p.RUN_ID AS RunId,
                 (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
                 p.PRECURSOR_MZ AS PrecursorMz,
                 STRING_AGG(p.PROTEIN_ACCESSION, ';') AS ProteinName,
@@ -327,7 +328,7 @@ def _read_library_data(self, con) -> pd.DataFrame:
             GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
                      p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
                      p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
-                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY
+                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID
         """
         return con.execute(query).fetchdf()
     

From 989ade11e768f7359b6c7afcde05a99c09d4d706 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 12 Aug 2025 15:03:24 -0400
Subject: [PATCH 19/25] update parameter descriptions

---
 pyprophet/cli/export.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pyprophet/cli/export.py b/pyprophet/cli/export.py
index 2d95f8c3..2162093c 100644
--- a/pyprophet/cli/export.py
+++ b/pyprophet/cli/export.py
@@ -369,21 +369,21 @@ def export_matrix(
     default=0.01,
     show_default=True,
     type=float,
-    help="Filter results to maximum run-specific peak group-level q-value, should not use values > 0.01.",
+    help="Filter results to maximum run-specific peak group-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates. If there are multiple runs with the same precursors, the run with the lowest q value is used",
 )
 @click.option(
     "--max_global_peptide_qvalue",
     default=0.01,
     show_default=True,
     type=float,
-    help="Filter results to maximum global peptide-level q-value, should not use values > 0.01.",
+    help="Filter results to maximum global peptide-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
 )
 @click.option(
     "--max_global_protein_qvalue",
     default=0.01,
     show_default=True,
     type=float,
-    help="Filter results to maximum global protein-level q-value, should not use values > 0.01.",
+    help="Filter results to maximum global protein-level q-value, using values greater than final statistical filtering (in most cases > 0.01), may lead to an overestimation in identification rates."
 )
 @click.option(
     "--rt_calibration/--no-rt_calibration",

From 0aae3a884d0adf9f6a6372d9628d7f84597724cb Mon Sep 17 00:00:00 2001
From: Joshua Charkow <47336288+jcharkow@users.noreply.github.com>
Date: Tue, 12 Aug 2025 15:16:04 -0400
Subject: [PATCH 20/25] fix: error description

Co-authored-by: Justin Sing <32938975+singjc@users.noreply.github.com>
---
 pyprophet/io/export/osw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/io/export/osw.py b/pyprophet/io/export/osw.py
index db6e2455..d4520930 100644
--- a/pyprophet/io/export/osw.py
+++ b/pyprophet/io/export/osw.py
@@ -124,7 +124,7 @@ def _read_sqlite(self, con):
         cfg = self.config
 
         if self.config.export_format == "library":
-            raise NotImplementedError("Library export from non-split .parquet files is not supported")
+            raise NotImplementedError("Library export from sqlite OSW files is not supported")
  
         if self._is_unscored_file(con):
             logger.info("Reading unscored data from Parquet file.")

From c3daea1cfb18fcac9fec258608ced40c88e3a8ad Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 12 Aug 2025 15:24:46 -0400
Subject: [PATCH 21/25] apply suggestions from PR review

---
 pyprophet/io/_base.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 4a6110e1..0e146343 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -48,6 +48,7 @@
 import duckdb
 import pandas as pd
 import polars as pl
+import sklearn.preprocessing as preprocessing
 from loguru import logger
 
 from .._base import BaseIOConfig
@@ -632,7 +633,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         # For precursors found in more than one run, select the run with the smallest q value
         # If q values are the same, select the first run
         data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1)
-        assert (len(data['TransitionId'].drop_duplicates()) == len(data))
+        assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.")
 
         # Remove Annotation Column if all NAN
         if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
@@ -640,7 +641,6 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             data.drop(columns=['Annotation'], inplace=True)
             data['Annotation'] = data['FragmentType'] + data['FragmentSeriesNumber'].astype(str) + '^' + data['FragmentCharge'].astype(str)
 
-        import sklearn.preprocessing as preprocessing
         if cfg.rt_calibration and cfg.rt_unit == "iRT":
             data['NormalizedRetentionTime'] = preprocessing.MinMaxScaler().fit_transform(data[['NormalizedRetentionTime']]) * 100
         if cfg.intensity_calibration:
@@ -648,6 +648,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             data['LibraryIntensity'] /
             data.groupby('Precursor')['LibraryIntensity'].transform('max') *
             10000)
+            logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0])))
             data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity
         
         ## Print Library statistics

From 3a8d20333499cef72471d91236aac9166fed3def Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 12 Aug 2025 15:45:58 -0400
Subject: [PATCH 22/25] test: update tests with new snapshots

After minor changes in data manipulation, update the snapshot tests.
---
 ...t.test_osw_analysis_libExport[osw-False-RT].out |  0
 ....test_osw_analysis_libExport[osw-False-iRT].out |  0
 ...rt.test_osw_analysis_libExport[osw-True-RT].out |  0
 ...t.test_osw_analysis_libExport[osw-True-iRT].out |  0
 ...st_osw_analysis_libExport[parquet-False-RT].out |  0
 ...t_osw_analysis_libExport[parquet-False-iRT].out |  0
 ...est_osw_analysis_libExport[parquet-True-RT].out |  0
 ...st_osw_analysis_libExport[parquet-True-iRT].out |  0
 ..._analysis_libExport[split_parquet-False-RT].out | 14 ++++++++++++++
 ...analysis_libExport[split_parquet-False-iRT].out | 14 ++++++++++++++
 ...osw_analysis_libExport[split_parquet-False].out | 14 --------------
 ...w_analysis_libExport[split_parquet-True-RT].out | 14 ++++++++++++++
 ..._analysis_libExport[split_parquet-True-iRT].out | 14 ++++++++++++++
 ..._osw_analysis_libExport[split_parquet-True].out | 14 --------------
 tests/test_pyprophet_export.py                     | 11 ++++++-----
 15 files changed, 62 insertions(+), 33 deletions(-)
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out
 delete mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out
 create mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out
 delete mode 100644 tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out

diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-RT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-False-iRT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-RT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[osw-True-iRT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-RT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-False-iRT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-RT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[parquet-True-iRT].out
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out
new file mode 100644
index 00000000..37c4cd9b
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-RT].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity        ModifiedPeptideSequence  NormalizedRetentionTime    PeptideSequence                        Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz              ProteinName
+0        -1^1      0               1                    -1          NaN              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153      AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510      AQUA4SWATH_HMLangeE
+2        b5^1      0               1                     5            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195      AQUA4SWATH_HMLangeE
+3        b6^1      0               1                     6            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   635.2671      AQUA4SWATH_HMLangeE
+4        b7^1      0               1                     7            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   748.3512      AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                            ...                      ...                ...                              ...              ...                   ...          ...        ...                      ...
+95       b4^1      0               1                     4            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   359.1561  AQUA4SWATH_PombeSchmidt
+96       b5^1      0               1                     5            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   472.2402  AQUA4SWATH_PombeSchmidt
+97       b6^1      0               1                     6            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   543.2773  AQUA4SWATH_PombeSchmidt
+98       b7^1      0               1                     7            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   644.3250  AQUA4SWATH_PombeSchmidt
+99       b8^1      0               1                     8            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   715.3621  AQUA4SWATH_PombeSchmidt
+
+[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out
new file mode 100644
index 00000000..37c4cd9b
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False-iRT].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity        ModifiedPeptideSequence  NormalizedRetentionTime    PeptideSequence                        Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz              ProteinName
+0        -1^1      0               1                    -1          NaN              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153      AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510      AQUA4SWATH_HMLangeE
+2        b5^1      0               1                     5            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195      AQUA4SWATH_HMLangeE
+3        b6^1      0               1                     6            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   635.2671      AQUA4SWATH_HMLangeE
+4        b7^1      0               1                     7            b              -1.0         AAEDFTLLVK(UniMod:259)                     58.9         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   748.3512      AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                            ...                      ...                ...                              ...              ...                   ...          ...        ...                      ...
+95       b4^1      0               1                     4            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   359.1561  AQUA4SWATH_PombeSchmidt
+96       b5^1      0               1                     5            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   472.2402  AQUA4SWATH_PombeSchmidt
+97       b6^1      0               1                     6            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   543.2773  AQUA4SWATH_PombeSchmidt
+98       b7^1      0               1                     7            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   644.3250  AQUA4SWATH_PombeSchmidt
+99       b8^1      0               1                     8            b              -1.0  AASEIATAELAPTHPIR(UniMod:267)                     31.5  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   715.3621  AQUA4SWATH_PombeSchmidt
+
+[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
deleted file mode 100644
index f3e17a61..00000000
--- a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-False].out
+++ /dev/null
@@ -1,14 +0,0 @@
-   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity     ModifiedPeptideSequence  NormalizedRetentionTime PeptideSequence                     Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz          ProteinName
-0        -1^1      0               1                    -1          NaN              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153  AQUA4SWATH_HMLangeE
-1        b4^1      0               1                     4            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510  AQUA4SWATH_HMLangeE
-2        b4^1      0               1                     4            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   403.1646  AQUA4SWATH_HMLangeE
-3        b5^1      0               1                     5            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   516.2486  AQUA4SWATH_HMLangeE
-4        b5^1      0               1                     5            b              -1.0      AAEDFTLLVK(UniMod:259)                     58.9      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195  AQUA4SWATH_HMLangeE
-..        ...    ...             ...                   ...          ...               ...                         ...                      ...             ...                           ...              ...                   ...          ...        ...                  ...
-95      y11^1      0               1                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1119.4173      AQUA4SWATH_Spyo
-96      y11^2      0               2                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   548.2978      AQUA4SWATH_Spyo
-97      y11^2      0               2                    11            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   560.2123      AQUA4SWATH_Spyo
-98      y12^1      0               1                    12            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1152.6099      AQUA4SWATH_Spyo
-99      y12^1      0               1                    12            y              -1.0  AAGASAQVLGQEGK(UniMod:259)                     -5.3  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1232.5013      AQUA4SWATH_Spyo
-
-[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out
new file mode 100644
index 00000000..5d75e9c9
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-RT].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity        ModifiedPeptideSequence  NormalizedRetentionTime    PeptideSequence                        Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz              ProteinName
+0        -1^1      0               1                    -1          NaN        10000.0000         AAEDFTLLVK(UniMod:259)                  3665.82         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153      AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b         1912.5839         AAEDFTLLVK(UniMod:259)                  3665.82         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510      AQUA4SWATH_HMLangeE
+2        b5^1      0               1                     5            b          910.8388         AAEDFTLLVK(UniMod:259)                  3665.82         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195      AQUA4SWATH_HMLangeE
+3        b6^1      0               1                     6            b          763.6335         AAEDFTLLVK(UniMod:259)                  3665.82         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   635.2671      AQUA4SWATH_HMLangeE
+4        b7^1      0               1                     7            b          619.6819         AAEDFTLLVK(UniMod:259)                  3665.82         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   748.3512      AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                            ...                      ...                ...                              ...              ...                   ...          ...        ...                      ...
+95       b4^1      0               1                     4            b         4293.9906  AASEIATAELAPTHPIR(UniMod:267)                  2754.99  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   359.1561  AQUA4SWATH_PombeSchmidt
+96       b5^1      0               1                     5            b         2245.5035  AASEIATAELAPTHPIR(UniMod:267)                  2754.99  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   472.2402  AQUA4SWATH_PombeSchmidt
+97       b6^1      0               1                     6            b         1169.3817  AASEIATAELAPTHPIR(UniMod:267)                  2754.99  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   543.2773  AQUA4SWATH_PombeSchmidt
+98       b7^1      0               1                     7            b          796.7460  AASEIATAELAPTHPIR(UniMod:267)                  2754.99  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   644.3250  AQUA4SWATH_PombeSchmidt
+99       b8^1      0               1                     8            b          616.6858  AASEIATAELAPTHPIR(UniMod:267)                  2754.99  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   715.3621  AQUA4SWATH_PombeSchmidt
+
+[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out
new file mode 100644
index 00000000..8f3d8b08
--- /dev/null
+++ b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True-iRT].out
@@ -0,0 +1,14 @@
+   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity        ModifiedPeptideSequence  NormalizedRetentionTime    PeptideSequence                        Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz              ProteinName
+0        -1^1      0               1                    -1          NaN        10000.0000         AAEDFTLLVK(UniMod:259)                  62.4638         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153      AQUA4SWATH_HMLangeE
+1        b4^1      0               1                     4            b         1912.5839         AAEDFTLLVK(UniMod:259)                  62.4638         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510      AQUA4SWATH_HMLangeE
+2        b5^1      0               1                     5            b          910.8388         AAEDFTLLVK(UniMod:259)                  62.4638         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195      AQUA4SWATH_HMLangeE
+3        b6^1      0               1                     6            b          763.6335         AAEDFTLLVK(UniMod:259)                  62.4638         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   635.2671      AQUA4SWATH_HMLangeE
+4        b7^1      0               1                     7            b          619.6819         AAEDFTLLVK(UniMod:259)                  62.4638         AAEDFTLLVK         AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   748.3512      AQUA4SWATH_HMLangeE
+..        ...    ...             ...                   ...          ...               ...                            ...                      ...                ...                              ...              ...                   ...          ...        ...                      ...
+95       b4^1      0               1                     4            b         4293.9906  AASEIATAELAPTHPIR(UniMod:267)                  45.3495  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   359.1561  AQUA4SWATH_PombeSchmidt
+96       b5^1      0               1                     5            b         2245.5035  AASEIATAELAPTHPIR(UniMod:267)                  45.3495  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   472.2402  AQUA4SWATH_PombeSchmidt
+97       b6^1      0               1                     6            b         1169.3817  AASEIATAELAPTHPIR(UniMod:267)                  45.3495  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   543.2773  AQUA4SWATH_PombeSchmidt
+98       b7^1      0               1                     7            b          796.7460  AASEIATAELAPTHPIR(UniMod:267)                  45.3495  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   644.3250  AQUA4SWATH_PombeSchmidt
+99       b8^1      0               1                     8            b          616.6858  AASEIATAELAPTHPIR(UniMod:267)                  45.3495  AASEIATAELAPTHPIR  AASEIATAELAPTHPIR(UniMod:267)_2                2                   NaN     879.4746   715.3621  AQUA4SWATH_PombeSchmidt
+
+[100 rows x 15 columns]
diff --git a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out b/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out
deleted file mode 100644
index d37d89e1..00000000
--- a/tests/_regtest_outputs/test_pyprophet_export.test_osw_analysis_libExport[split_parquet-True].out
+++ /dev/null
@@ -1,14 +0,0 @@
-   Annotation  Decoy  FragmentCharge  FragmentSeriesNumber FragmentType  LibraryIntensity     ModifiedPeptideSequence  NormalizedRetentionTime PeptideSequence                     Precursor  PrecursorCharge  PrecursorIonMobility  PrecursorMz  ProductMz          ProteinName
-0        -1^1      0               1                    -1          NaN        10000.0000      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   557.8153  AQUA4SWATH_HMLangeE
-1        b4^1      0               1                     4            b         1912.5839      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   387.1510  AQUA4SWATH_HMLangeE
-2        b4^1      0               1                     4            b          704.8697      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   403.1646  AQUA4SWATH_HMLangeE
-3        b5^1      0               1                     5            b         1185.3327      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   516.2486  AQUA4SWATH_HMLangeE
-4        b5^1      0               1                     5            b          910.8388      AAEDFTLLVK(UniMod:259)                  62.4638      AAEDFTLLVK      AAEDFTLLVK(UniMod:259)_2                2                   NaN     557.8153   534.2195  AQUA4SWATH_HMLangeE
-..        ...    ...             ...                   ...          ...               ...                         ...                      ...             ...                           ...              ...                   ...          ...        ...                  ...
-95      y11^1      0               1                    11            y            7.9894  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1119.4173      AQUA4SWATH_Spyo
-96      y11^2      0               2                    11            y          220.1077  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   548.2978      AQUA4SWATH_Spyo
-97      y11^2      0               2                    11            y          107.0578  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457   560.2123      AQUA4SWATH_Spyo
-98      y12^1      0               1                    12            y         1006.7430  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1152.6099      AQUA4SWATH_Spyo
-99      y12^1      0               1                    12            y            2.3968  AAGASAQVLGQEGK(UniMod:259)                  22.1652  AAGASAQVLGQEGK  AAGASAQVLGQEGK(UniMod:259)_2                2                   NaN     647.8457  1232.5013      AQUA4SWATH_Spyo
-
-[100 rows x 15 columns]
diff --git a/tests/test_pyprophet_export.py b/tests/test_pyprophet_export.py
index 1a4910f8..6e05f8c3 100644
--- a/tests/test_pyprophet_export.py
+++ b/tests/test_pyprophet_export.py
@@ -151,10 +151,10 @@ def test_osw_analysis(
     )
 
 @pytest.mark.parametrize(
-    "calib",
-    [ True, False]
+    "calib, rt_unit",
+    [ (True, 'iRT'), (False, 'iRT'), (True, 'RT'), (False, 'RT')]
 )
-def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib
+def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib, rt_unit
 ):
     cmd = f"pyprophet score {input_strategy['cmd_prefix']} --level=ms2 --test --pi0_lambda=0.001 0 0 --ss_iteration_fdr=0.02 && "
 
@@ -164,11 +164,12 @@ def test_osw_analysis_libExport(input_strategy, temp_folder, regtest, calib
     # protein-level
     cmd += f"pyprophet infer protein --pi0_lambda=0 0 0 {input_strategy['cmd_prefix']} --context=global && "
 
+
     # export
     if calib:
-        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1"
+        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --rt_unit={rt_unit}"
     else:
-        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration"
+        cmd += f"pyprophet export library {input_strategy['cmd_prefix']} --out={temp_folder}/test_lib.tsv --test --max_peakgroup_qvalue=1 --max_global_peptide_qvalue=1 --max_global_protein_qvalue=1 --no-rt_calibration --no-im_calibration --no-intensity_calibration --rt_unit={rt_unit}"
 
     if not input_strategy["reader"] == "parquet_split":
         with pytest.raises(NotImplementedError):

From 76ac52c32eca2624e56ea92bc94b105ebadc0ae7 Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 19 Aug 2025 16:07:46 -0400
Subject: [PATCH 23/25] feature: sort by intensity if q value tie

If the q-value and intensity are both tied, fall back to sorting by RunId.
---
 pyprophet/io/_base.py                | 4 ++--
 pyprophet/io/export/split_parquet.py | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 0e146343..878075d7 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -632,7 +632,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
 
         # For precursors found in more than one run, select the run with the smallest q value
         # If q values are the same, select the first run
-        data = data.sort_values(by=['Q_Value', 'RunId']).groupby("TransitionId").head(1)
+        data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1)
         assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.")
 
         # Remove Annotation Column if all NAN
@@ -668,7 +668,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         if cfg.keep_decoys:
             logger.info("Of Which {} are decoys".format(len(data[data['Decoy'] == 1]['Precursor'].drop_duplicates())))
 
-        data.drop(columns=['TransitionId', 'Q_Value', 'RunId'], inplace=True)
+        data.drop(columns=['TransitionId', 'Q_Value', 'RunId', 'Intensity'], inplace=True)
         if cfg.test:
             data = data.sort_values(by=['Precursor', 'FragmentType', 'FragmentSeriesNumber', 'FragmentCharge', 'ProductMz'])
 
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index d17a75da..2f6812b1 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -306,6 +306,7 @@ def _read_library_data(self, con) -> pd.DataFrame:
                 p.UNMODIFIED_SEQUENCE AS PeptideSequence,
                 p.MODIFIED_SEQUENCE AS ModifiedPeptideSequence,
                 p.PRECURSOR_CHARGE AS PrecursorCharge,
+                p.FEATURE_MS2_AREA_INTENSITY AS Intensity,
                 p.RUN_ID AS RunId,
                 (p.MODIFIED_SEQUENCE || '_' || CAST(p.PRECURSOR_CHARGE AS VARCHAR)) AS Precursor,
                 p.PRECURSOR_MZ AS PrecursorMz,

From 1c32cb6f404135c3b37f1b866185413f0e73878b Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Tue, 19 Aug 2025 16:15:45 -0400
Subject: [PATCH 24/25] apply copilot suggestions

---
 pyprophet/io/_base.py                | 7 ++++---
 pyprophet/io/export/split_parquet.py | 8 ++++++--
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/pyprophet/io/_base.py b/pyprophet/io/_base.py
index 878075d7..23aa3680 100644
--- a/pyprophet/io/_base.py
+++ b/pyprophet/io/_base.py
@@ -48,7 +48,7 @@
 import duckdb
 import pandas as pd
 import polars as pl
-import sklearn.preprocessing as preprocessing
+import sklearn.preprocessing as preprocessing # For MinMaxScaler
 from loguru import logger
 
 from .._base import BaseIOConfig
@@ -633,7 +633,7 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
         # For precursors found in more than one run, select the run with the smallest q value
         # If q values are the same, select the first run
         data = data.sort_values(by=['Q_Value', 'Intensity', 'RunId']).groupby("TransitionId").head(1)
-        assert (len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value and RunId, duplicate transition IDs found.")
+        assert len(data['TransitionId'].drop_duplicates()) == len(data), "After filtering by Q_Value Intensity and RunId, duplicate transition IDs found."
 
         # Remove Annotation Column if all NAN
         if data['Annotation'].isnull().all() or data['Annotation'].eq("NA").all():
@@ -649,7 +649,8 @@ def clean_and_export_library(self, data: pd.DataFrame) -> pd.DataFrame:
             data.groupby('Precursor')['LibraryIntensity'].transform('max') *
             10000)
             logger.debug("Removing {} rows with zero intensity.".format(len(data[data['LibraryIntensity'] <= 0])))
-            data = data[data['LibraryIntensity'] > 0] # Remove rows with zero intensity
+            # Remove rows with non-positive intensity (LibraryIntensity <= 0)
+            data = data[data['LibraryIntensity'] > 0] 
         
         ## Print Library statistics
         logger.info(f"Library Contains {len(data['Precursor'].drop_duplicates())} Precursors")
diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 2f6812b1..0cbf9732 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -70,9 +70,13 @@ def read(self) -> pd.DataFrame:
 
             if self.config.export_format == "library":
                 if self._is_unscored_file():
-                    raise logger.exception("Files must be scored for library generation.")
+                    descr= "Files must be scored for library generation."
+                    logger.exception(descr)
+                    raise ValueError(descr)
                 if not self._has_peptide_protein_global_scores():
-                    raise logger.exception("Files must have peptide and protein level global scores for library generation.")
+                    descr= "Files must have peptide and protein level global scores for library generation."
+                    logger.exception(descr)
+                    raise ValueError(descr)
                 logger.info("Reading standard OpenSWATH data for library from split Parquet files.")
                 return self._read_library_data(con)
 

From 0c0d99dce09eeacba13a0fdde4dea9e8dc17d71c Mon Sep 17 00:00:00 2001
From: Joshua Charkow <joshuacharkow@gmail.com>
Date: Thu, 28 Aug 2025 10:00:49 -0400
Subject: [PATCH 25/25] fix: add FEATURE_MS2_AREA_INTENSITY to GROUP BY in library SQL query

---
 pyprophet/io/export/split_parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyprophet/io/export/split_parquet.py b/pyprophet/io/export/split_parquet.py
index 0cbf9732..265130a8 100644
--- a/pyprophet/io/export/split_parquet.py
+++ b/pyprophet/io/export/split_parquet.py
@@ -333,7 +333,7 @@ def _read_library_data(self, con) -> pd.DataFrame:
             GROUP BY {rt_col}, {im_col}, {intensity_col}, p.SCORE_MS2_Q_VALUE,
                      p.UNMODIFIED_SEQUENCE, p.MODIFIED_SEQUENCE, p.PRECURSOR_CHARGE,
                      p.PRECURSOR_MZ, p.FEATURE_ID, t.ANNOTATION, t.PRODUCT_MZ,
-                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID
+                     t.TRANSITION_CHARGE, t.TRANSITION_TYPE, t.TRANSITION_ORDINAL, t.TRANSITION_ID, p.PRECURSOR_DECOY, p.RUN_ID, p.FEATURE_MS2_AREA_INTENSITY
         """
         return con.execute(query).fetchdf()