From bb42090f79062534a259315b14d8cb63f2b9ce32 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Tue, 8 Apr 2025 11:21:39 -0700 Subject: [PATCH 1/2] refactor: Move Peaks from AOS to SOA --- crates/sage-cli/src/output.rs | 77 ++++--------------- crates/sage-cli/src/runner.rs | 32 ++++---- crates/sage-cloudpath/src/tdf.rs | 4 +- crates/sage/src/fdr.rs | 2 +- crates/sage/src/lfq.rs | 122 +++++++++++-------------------- crates/sage/src/modification.rs | 2 +- crates/sage/src/scoring.rs | 78 +++++++++++++------- crates/sage/src/spectrum.rs | 86 +++++++++++----------- crates/sage/src/tmt.rs | 34 ++++++--- 9 files changed, 192 insertions(+), 245 deletions(-) diff --git a/crates/sage-cli/src/output.rs b/crates/sage-cli/src/output.rs index 1efdcaab..59cdaac1 100644 --- a/crates/sage-cli/src/output.rs +++ b/crates/sage-cli/src/output.rs @@ -1,14 +1,23 @@ use rayon::prelude::*; -use sage_core::spectrum::MS1Spectra; +use sage_core::spectrum::ProcessedSpectrum; use sage_core::{scoring::Feature, tmt::TmtQuant}; #[derive(Default)] pub struct SageResults { - pub ms1: MS1Spectra, + pub ms1: Vec, pub features: Vec, pub quant: Vec, } +impl SageResults { + fn fold(mut self, other: SageResults) -> Self { + self.features.extend(other.features); + self.quant.extend(other.quant); + self.ms1.extend(other.ms1); + self + } +} + impl FromParallelIterator for SageResults { fn from_par_iter(par_iter: I) -> Self where @@ -16,39 +25,7 @@ impl FromParallelIterator for SageResults { { par_iter .into_par_iter() - .reduce(SageResults::default, |mut acc, x| { - acc.features.extend(x.features); - acc.quant.extend(x.quant); - match (acc.ms1, x.ms1) { - (MS1Spectra::NoMobility(mut a), MS1Spectra::NoMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::NoMobility(a); - } - (MS1Spectra::WithMobility(mut a), MS1Spectra::WithMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::Empty; - } - (MS1Spectra::Empty, MS1Spectra::WithMobility(a)) - | (MS1Spectra::WithMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::NoMobility(a)) - | (MS1Spectra::NoMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::NoMobility(a); - } - _ => { - // In theory this can happen if someone is searching - // together files of different types, mixing the ones - // that support IMS and the ones that dont. - // ... I dont think this should be run-time recoverable. - unreachable!("Found combination of MS1 spectra with and without mobility.") - } - }; - acc - }) + .reduce(SageResults::default, SageResults::fold) } } @@ -59,34 +36,6 @@ impl FromIterator for SageResults { { par_iter .into_iter() - .fold(SageResults::default(), |mut acc, x| { - acc.features.extend(x.features); - acc.quant.extend(x.quant); - match (acc.ms1, x.ms1) { - (MS1Spectra::NoMobility(mut a), MS1Spectra::NoMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::NoMobility(a); - } - (MS1Spectra::WithMobility(mut a), MS1Spectra::WithMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::Empty; - } - (MS1Spectra::Empty, MS1Spectra::WithMobility(a)) - | (MS1Spectra::WithMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::NoMobility(a)) - | (MS1Spectra::NoMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::NoMobility(a); - } - _ => { - unreachable!("Found combination of MS1 spectra with and without mobility.") - } - }; - acc - }) + .fold(SageResults::default(), SageResults::fold) } } diff --git a/crates/sage-cli/src/runner.rs b/crates/sage-cli/src/runner.rs index 40915112..259f5e4f 100644 --- a/crates/sage-cli/src/runner.rs +++ b/crates/sage-cli/src/runner.rs @@ -14,7 +14,7 @@ use sage_core::mass::Tolerance; use sage_core::peptide::Peptide; use sage_core::scoring::Fragments; use sage_core::scoring::{Feature, Scorer}; -use sage_core::spectrum::{MS1Spectra, ProcessedSpectrum, RawSpectrum, SpectrumProcessor}; +use sage_core::spectrum::{ProcessedSpectrum, RawSpectrum, SpectrumProcessor}; use sage_core::tmt::TmtQuant; use std::collections::HashMap; use std::collections::HashSet; @@ -136,8 +136,8 @@ impl Runner { pub fn prefilter_peptides(self, parallel: usize, fasta: Fasta) -> Vec { let spectra: Option<( - MS1Spectra, - Vec>, + Vec, + Vec, )> = match parallel >= self.parameters.mzml_paths.len() { true => Some(self.read_processed_spectra(&self.parameters.mzml_paths, 0, 0)), false => None, @@ -207,7 +207,7 @@ impl Runner { fn peptide_filter_processed_spectra( &self, scorer: &Scorer, - spectra: &Vec>, + spectra: &Vec, ) -> Vec { use std::sync::atomic::{AtomicUsize, Ordering}; let counter = AtomicUsize::new(0); @@ -215,7 +215,7 @@ impl Runner { let peptide_idxs: Vec<_> = spectra .par_iter() - .filter(|spec| spec.peaks.len() >= self.parameters.min_peaks && spec.level == 2) + .filter(|spec| spec.masses.len() >= self.parameters.min_peaks && spec.level == 2) .map(|x| { let prev = counter.fetch_add(1, Ordering::Relaxed); if prev > 0 && prev % 10_000 == 0 { @@ -262,7 +262,7 @@ impl Runner { fn search_processed_spectra( &self, scorer: &Scorer, - msn_spectra: &Vec>, + msn_spectra: &Vec, ) -> Vec { use std::sync::atomic::{AtomicUsize, Ordering}; let counter = AtomicUsize::new(0); @@ -270,7 +270,7 @@ impl Runner { let features: Vec<_> = msn_spectra .par_iter() - .filter(|spec| spec.peaks.len() >= self.parameters.min_peaks && spec.level == 2) + .filter(|spec| spec.masses.len() >= self.parameters.min_peaks && spec.level == 2) .map(|x| { let prev = counter.fetch_add(1, Ordering::Relaxed); if prev > 0 && prev % 10_000 == 0 { @@ -293,8 +293,8 @@ impl Runner { fn complete_features( &self, - msn_spectra: Vec>, - ms1_spectra: MS1Spectra, + msn_spectra: Vec, + ms1_spectra: Vec, features: Vec, ) -> SageResults { let quant = self @@ -340,8 +340,8 @@ impl Runner { chunk_idx: usize, batch_size: usize, ) -> ( - MS1Spectra, - Vec>, + Vec, + Vec, ) { // Read all of the spectra at once - this can help prevent memory over-consumption issues info!( @@ -430,19 +430,17 @@ impl Runner { let ms1_empty = spectra.ms1.is_empty(); let ms1_spectra = if ms1_empty { log::trace!("no MS1 spectra found"); - MS1Spectra::Empty + vec![] } else if all_contain_ims { log::trace!("Processing MS1 spectra with IMS"); - let spectra = spectra + spectra .ms1 .into_iter() .map(|x| sp.process_with_mobility(x)) - .collect(); - MS1Spectra::WithMobility(spectra) + .collect() } else { log::trace!("Processing MS1 spectra without IMS"); - let spectra = spectra.ms1.into_iter().map(|s| sp.process(s)).collect(); - MS1Spectra::NoMobility(spectra) + spectra.ms1.into_iter().map(|s| sp.process(s)).collect() }; let io_time = Instant::now() - start; diff --git a/crates/sage-cloudpath/src/tdf.rs b/crates/sage-cloudpath/src/tdf.rs index 57290408..9d9ca706 100644 --- a/crates/sage-cloudpath/src/tdf.rs +++ b/crates/sage-cloudpath/src/tdf.rs @@ -221,7 +221,7 @@ impl PeakBuffer { // The "order" is sorted by intensity // This will be used later during the centroiding (for details check that implementation) - self.order.extend((0..self.len())); + self.order.extend(0..self.len()); self.order.sort_unstable_by(|&a, &b| { self.peaks[b] .intensity @@ -375,7 +375,7 @@ impl PeakBuffer { global_num_included += num_includable; if global_num_included == self.len() { - log::debug!("All peaks were included in the centroiding"); + log::trace!("All peaks were included in the centroiding"); break; } } diff --git a/crates/sage/src/fdr.rs b/crates/sage/src/fdr.rs index 75658a38..554ee7b9 100644 --- a/crates/sage/src/fdr.rs +++ b/crates/sage/src/fdr.rs @@ -235,7 +235,7 @@ pub fn picked_precursor( .map(|score| ((score.ix, score.decoy), score.q)) .collect::>(); - peaks.par_iter_mut().for_each(|((ix), (peak, _))| { + peaks.par_iter_mut().for_each(|(ix, (peak, _))| { peak.q_value = scores[ix]; }); passing diff --git a/crates/sage/src/lfq.rs b/crates/sage/src/lfq.rs index 0a0ef60f..8618b5e5 100644 --- a/crates/sage/src/lfq.rs +++ b/crates/sage/src/lfq.rs @@ -2,7 +2,7 @@ use crate::database::{binary_search_slice, IndexedDatabase, PeptideIx}; use crate::mass::{composition, Composition, Tolerance, NEUTRON}; use crate::ml::{matrix::Matrix, retention_alignment::Alignment}; use crate::scoring::Feature; -use crate::spectrum::MS1Spectra; +use crate::spectrum::ProcessedSpectrum; use dashmap::DashMap; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -224,7 +224,7 @@ impl FeatureMap { pub fn quantify( &self, db: &IndexedDatabase, - spectra: &MS1Spectra, + spectra: &Vec, alignments: &[Alignment], ) -> HashMap<(PrecursorId, bool), (Peak, Vec), fnv::FnvBuildHasher> { let scores: DashMap<(PrecursorId, bool), Grid, fnv::FnvBuildHasher> = DashMap::default(); @@ -232,72 +232,43 @@ impl FeatureMap { log::info!("tracing MS1 features"); // TODO: find a good way to abstract this ... I think a macro would be the way to go. - match spectra { - MS1Spectra::NoMobility(spectra) => spectra.par_iter().for_each(|spectrum| { - let a = alignments[spectrum.file_id]; - let rt = (spectrum.scan_start_time / a.max_rt) * a.slope + a.intercept; - let query = self.rt_slice(rt, RT_TOL); - - for peak in &spectrum.peaks { - for entry in query.mass_lookup(peak.mass) { - let id = match self.settings.combine_charge_states { - true => PrecursorId::Combined(entry.peptide), - false => PrecursorId::Charged((entry.peptide, entry.charge)), - }; - - let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { - let p = &db[entry.peptide]; - let composition = p - .sequence - .iter() - .map(|r| composition(*r)) - .sum::(); - let dist = crate::isotopes::peptide_isotopes( - composition.carbon, - composition.sulfur, - ); - Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) - }); - - grid.add_entry(rt, entry.isotope, spectrum.file_id, peak.intensity); - } - } - }), - MS1Spectra::WithMobility(spectra) => spectra.par_iter().for_each(|spectrum| { - let a = alignments[spectrum.file_id]; - let rt = (spectrum.scan_start_time / a.max_rt) * a.slope + a.intercept; - let query = self.rt_slice(rt, RT_TOL); - - for peak in &spectrum.peaks { - for entry in query.mass_mobility_lookup(peak.mass, peak.mobility) { - let id = match self.settings.combine_charge_states { - true => PrecursorId::Combined(entry.peptide), - false => PrecursorId::Charged((entry.peptide, entry.charge)), - }; - - let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { - let p = &db[entry.peptide]; - let composition = p - .sequence - .iter() - .map(|r| composition(*r)) - .sum::(); - let dist = crate::isotopes::peptide_isotopes( - composition.carbon, - composition.sulfur, - ); - Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) - }); - - grid.add_entry(rt, entry.isotope, spectrum.file_id, peak.intensity); - } + spectra.par_iter().for_each(|spectrum| { + let a = alignments[spectrum.file_id]; + let rt = (spectrum.scan_start_time / a.max_rt) * a.slope + a.intercept; + let query = self.rt_slice(rt, RT_TOL); + + for (peak_idx, peak_mass) in spectrum.masses.iter().enumerate() { + + let mob = match &spectrum.mobilities { + Some(mobs) => Some(mobs[peak_idx]), + None => None, + }; + let inten = spectrum.intensities[peak_idx]; + + for entry in query.mass_lookup(*peak_mass, mob){ + let id = match self.settings.combine_charge_states { + true => PrecursorId::Combined(entry.peptide), + false => PrecursorId::Charged((entry.peptide, entry.charge)), + }; + + let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { + let p = &db[entry.peptide]; + let composition = p + .sequence + .iter() + .map(|r| composition(*r)) + .sum::(); + let dist = crate::isotopes::peptide_isotopes( + composition.carbon, + composition.sulfur, + ); + Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) + }); + + grid.add_entry(rt, entry.isotope, spectrum.file_id, inten); } - }), - MS1Spectra::Empty => { - // Should never be called if no MS1 spectra are present - log::warn!("no MS1 spectra found for quantification"); } - }; + }); log::info!("integrating MS1 features"); @@ -658,7 +629,7 @@ fn convolve(slice: &[f64], kernel: &[f64]) -> Vec { } impl Query<'_> { - pub fn mass_lookup(&self, mass: f32) -> impl Iterator { + pub fn mass_lookup(&self, mass: f32, mobility: Option) -> impl Iterator { (self.page_lo..self.page_hi).flat_map(move |page| { let left_idx = page * self.bin_size; // Last chunk not guaranted to be modulo bucket size, make sure we don't @@ -683,17 +654,12 @@ impl Query<'_> { && mass >= frag.mass_lo && mass <= frag.mass_hi }) - }) - } - - pub fn mass_mobility_lookup( - &self, - mass: f32, - mobility: f32, - ) -> impl Iterator { - self.mass_lookup(mass).filter(move |precursor| { - // TODO: replace this magic number with a patameter. - (precursor.mobility_hi >= mobility) && (precursor.mobility_lo <= mobility) + }).filter(move |precursor| { + if let Some(mobility) = mobility { + (precursor.mobility_hi >= mobility) && (precursor.mobility_lo <= mobility) + } else { + true + } }) } } diff --git a/crates/sage/src/modification.rs b/crates/sage/src/modification.rs index 5d025ea1..afdef9a4 100644 --- a/crates/sage/src/modification.rs +++ b/crates/sage/src/modification.rs @@ -4,7 +4,7 @@ use std::{ str::FromStr, }; -use serde::{de::Visitor, Deserialize, Serialize}; +use serde::Serialize; use crate::mass::VALID_AA; diff --git a/crates/sage/src/scoring.rs b/crates/sage/src/scoring.rs index 2ae1242f..51267a4c 100644 --- a/crates/sage/src/scoring.rs +++ b/crates/sage/src/scoring.rs @@ -2,7 +2,7 @@ use crate::database::{IndexedDatabase, PeptideIx}; use crate::heap::bounded_min_heapify; use crate::ion_series::{IonSeries, Kind}; use crate::mass::{Tolerance, NEUTRON, PROTON}; -use crate::spectrum::{Peak, Precursor, ProcessedSpectrum}; +use crate::spectrum::{Precursor, ProcessedSpectrum}; use serde::{Deserialize, Serialize}; use std::ops::AddAssign; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -245,7 +245,7 @@ fn max_fragment_charge(max_fragment_charge: Option, precursor_charge: u8) -> impl<'db> Scorer<'db> { pub fn quick_score( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, prefilter_low_memory: bool, ) -> Vec { assert_eq!( @@ -284,7 +284,7 @@ impl<'db> Scorer<'db> { } } - pub fn score(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score(&self, query: &ProcessedSpectrum) -> Vec { assert_eq!( query.level, 2, "internal bug, trying to score a non-MS2 scan!" @@ -321,7 +321,7 @@ impl<'db> Scorer<'db> { /// in sorted order. fn matched_peaks_with_isotope( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor_mass: f32, precursor_charge: u8, precursor_tol: Tolerance, @@ -342,9 +342,9 @@ impl<'db> Scorer<'db> { preliminary: vec![PreScore::default(); potential], }; - for peak in query.peaks.iter() { + for peak_mass in &query.masses { for charge in 1..max_fragment_charge { - for frag in candidates.page_search(peak.mass, charge) { + for frag in candidates.page_search(*peak_mass, charge) { let idx = frag.peptide_index.0 as usize - candidates.pre_idx_lo; let sc = &mut hits.preliminary[idx]; if sc.matched == 0 { @@ -369,7 +369,7 @@ impl<'db> Scorer<'db> { fn matched_peaks( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor_mass: f32, precursor_charge: u8, precursor_tol: Tolerance, @@ -401,7 +401,7 @@ impl<'db> Scorer<'db> { } } - fn initial_hits(&self, query: &ProcessedSpectrum, precursor: &Precursor) -> InitialHits { + fn initial_hits(&self, query: &ProcessedSpectrum, precursor: &Precursor) -> InitialHits { // Sage operates on masses without protons; [M] instead of [MH+] let mz = precursor.mz - PROTON; @@ -448,7 +448,7 @@ impl<'db> Scorer<'db> { } /// Score a single [`ProcessedSpectrum`] against the database - pub fn score_standard(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score_standard(&self, query: &ProcessedSpectrum) -> Vec { let precursor = query.precursors.first().unwrap_or_else(|| { panic!("missing MS1 precursor for {}", query.id); }); @@ -463,7 +463,7 @@ impl<'db> Scorer<'db> { /// best PSMs ([`Feature`]) fn build_features( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor: &Precursor, hits: &InitialHits, report_psms: usize, @@ -576,7 +576,7 @@ impl<'db> Scorer<'db> { } /// Remove peaks matching a PSM from a query spectrum - fn remove_matched_peaks(&self, query: &mut ProcessedSpectrum, psm: &Feature) { + fn remove_matched_peaks(&self, query: &mut ProcessedSpectrum, psm: &Feature) { let peptide = &self.db[psm.peptide_idx]; let fragments = self .db @@ -587,32 +587,54 @@ impl<'db> Scorer<'db> { let max_fragment_charge = max_fragment_charge(self.max_fragment_charge, psm.charge); // Remove MS2 peaks matched by previous match - let mut to_remove = Vec::new(); + let mut to_remove_idx = Vec::new(); for frag in fragments { for charge in 1..max_fragment_charge { // Experimental peaks are multipled by charge, therefore theoretical are divided if let Some(peak) = crate::spectrum::select_most_intense_peak( - &query.peaks, + &query.masses, + &query.intensities, frag.monoisotopic_mass / charge as f32, self.fragment_tol, None, ) { - to_remove.push(*peak); + to_remove_idx.push(peak); } } } - query.peaks = query - .peaks + query.masses = query + .masses .drain(..) - .filter(|peak| !to_remove.contains(peak)) + .enumerate() + .filter(|(idx, _)| !to_remove_idx.contains(idx)) + .map(|(_, mass)| mass) .collect(); - query.total_ion_current = query.peaks.iter().map(|peak| peak.intensity).sum::(); + + query.intensities = query + .intensities + .drain(..) + .enumerate() + .filter(|(idx, _)| !to_remove_idx.contains(idx)) + .map(|(_, intensity)| intensity) + .collect(); + + query.mobilities = match query.mobilities { + Some(ref mut mobilities) => Some(mobilities + .drain(..) + .enumerate() + .filter(|(idx, _)| !to_remove_idx.contains(idx)) + .map(|(_, mobility)| mobility) + .collect()), + None => None, + }; + + query.total_ion_current = query.intensities.iter().sum::(); } /// Return multiple PSMs for each spectra - first is the best match, second PSM is the best match /// after all theoretical peaks assigned to the best match are removed, etc - pub fn score_chimera_fast(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score_chimera_fast(&self, query: &ProcessedSpectrum) -> Vec { let precursor = query.precursors.first().unwrap_or_else(|| { panic!("missing MS1 precursor for {}", query.id); }); @@ -641,7 +663,7 @@ impl<'db> Scorer<'db> { /// Calculate full hyperscore for a given PSM fn score_candidate( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, pre_score: &PreScore, ) -> (Score, Option) { let mut score = Score { @@ -673,27 +695,29 @@ impl<'db> Scorer<'db> { // Experimental peaks are multipled by charge, therefore theoretical are divided let mz = frag.monoisotopic_mass / charge as f32; - if let Some(peak) = crate::spectrum::select_most_intense_peak( - &query.peaks, + if let Some(peak_idx) = crate::spectrum::select_most_intense_peak( + &query.masses, + &query.intensities, mz, self.fragment_tol, None, ) { score.ppm_difference += - peak.intensity * (mz - peak.mass).abs() * 2E6 / (mz + peak.mass); + query.intensities[peak_idx] * (mz - query.masses[peak_idx]).abs() * 2E6 / (mz + query.masses[peak_idx]); + let peak_intensity = query.intensities[peak_idx]; - let exp_mz = peak.mass + PROTON; + let exp_mz = query.masses[peak_idx] + PROTON; let calc_mz = mz + PROTON; match frag.kind { Kind::A | Kind::B | Kind::C => { score.matched_b += 1; - score.summed_b += peak.intensity; + score.summed_b += peak_intensity; b_run.matched(idx); } Kind::X | Kind::Y | Kind::Z => { score.matched_y += 1; - score.summed_y += peak.intensity; + score.summed_y += peak_intensity; y_run.matched(idx); } } @@ -710,7 +734,7 @@ impl<'db> Scorer<'db> { fragments_details.mz_experimental.push(exp_mz); fragments_details.mz_calculated.push(calc_mz); fragments_details.fragment_ordinals.push(idx); - fragments_details.intensities.push(peak.intensity); + fragments_details.intensities.push(peak_intensity); } } } diff --git a/crates/sage/src/spectrum.rs b/crates/sage/src/spectrum.rs index e5f2849f..ba744c93 100644 --- a/crates/sage/src/spectrum.rs +++ b/crates/sage/src/spectrum.rs @@ -12,15 +12,15 @@ impl Eq for Peak {} impl PartialOrd for Peak { fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + self.intensity + .partial_cmp(&other.intensity) + .or_else(|| self.mass.partial_cmp(&other.mass)) } } impl Ord for Peak { fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.intensity - .total_cmp(&other.intensity) - .then_with(|| self.mass.total_cmp(&other.mass)) + self.partial_cmp(other).unwrap_or(std::cmp::Ordering::Equal) } } @@ -80,7 +80,7 @@ pub struct Precursor { } #[derive(Clone, Default, Debug)] -pub struct ProcessedSpectrum { +pub struct ProcessedSpectrum { /// MSn level pub level: u8, /// Scan ID @@ -94,7 +94,9 @@ pub struct ProcessedSpectrum { /// Selected ions for precursors, if `level > 1` pub precursors: Vec, /// MS peaks, sorted by mass in ascending order - pub peaks: Vec, + pub masses: Vec, + pub intensities: Vec, + pub mobilities: Option>, /// Total ion current pub total_ion_current: f32, } @@ -149,18 +151,6 @@ impl Default for Representation { } } -pub enum MS1Spectra { - NoMobility(Vec>), - WithMobility(Vec>), - Empty, -} - -impl Default for MS1Spectra { - fn default() -> Self { - Self::Empty - } -} - /// Binary search followed by linear search to select the most intense peak within `tolerance` window /// * `offset` - this parameter allows for a static adjustment to the lower and upper bounds of the search window. /// Sage subtracts a proton (and assumes z=1) for all experimental peaks, and stores all fragments as monoisotopic @@ -169,31 +159,32 @@ impl Default for MS1Spectra { /// measurements with ProteomeDiscoverer, FragPipe, etc, we need to account for this minor difference (which has an impact /// perhaps 0.01% of the time) pub fn select_most_intense_peak( - peaks: &[Peak], + masses: &[f32], + intensities: &[f32], center: f32, tolerance: Tolerance, offset: Option, -) -> Option<&Peak> { +) -> Option { let (lo, hi) = tolerance.bounds(center); let (lo, hi) = ( lo + offset.unwrap_or_default(), hi + offset.unwrap_or_default(), ); - let (i, j) = binary_search_slice(peaks, |peak, query| peak.mass.total_cmp(query), lo, hi); + let (i, j) = binary_search_slice(masses, |mass, query| mass.total_cmp(query), lo, hi); - let mut best_peak = None; + let mut best_peak_idx = None; let mut max_int = 0.0; - for peak in peaks[i..j] - .iter() - .filter(|peak| peak.mass >= lo && peak.mass <= hi) + for peak_idx in (i..j) + .into_iter() + .filter(|peak_idx| masses[*peak_idx] >= lo && masses[*peak_idx] <= hi) { - if peak.intensity >= max_int { - max_int = peak.intensity; - best_peak = Some(peak); + if intensities[peak_idx] >= max_int { + max_int = intensities[peak_idx]; + best_peak_idx = Some(peak_idx); } } - best_peak + best_peak_idx } // pub fn find_spectrum_by_id( @@ -276,7 +267,7 @@ pub fn path_compression(peaks: &mut [Deisotoped]) { } } -impl ProcessedSpectrum { +impl ProcessedSpectrum { pub fn extract_ms1_precursor(&self) -> Option<(f32, u8)> { let precursor = self.precursors.first()?; let charge = precursor.charge?; @@ -348,7 +339,7 @@ impl SpectrumProcessor { } }) .take(self.take_top_n) - .collect::>() + .collect::>() } else { let mut peaks = spectrum .mz @@ -365,22 +356,22 @@ impl SpectrumProcessor { } } - pub fn process(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { + pub fn process(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { let mut peaks = match spectrum.ms_level { 2 => self.process_ms2(self.deisotope, &spectrum), - _ => spectrum - .mz - .iter() - .zip(spectrum.intensity.iter()) - .map(|(&mass, &intensity)| { + _ => (spectrum.mz.iter().zip(spectrum.intensity.iter())).map(|(&mass, &intensity)| { let mass = (mass - PROTON) * 1.0; Peak { mass, intensity } }) - .collect::>(), + .collect::>() }; peaks.sort_by(|a, b| a.mass.total_cmp(&b.mass)); - let total_ion_current = peaks.iter().map(|peak| peak.intensity).sum::(); + let (masses, intensities): (Vec<_>, Vec<_>) = peaks + .into_iter() + .map(|peak| (peak.mass, peak.intensity)) + .unzip(); + let total_ion_current = intensities.iter().sum::(); ProcessedSpectrum { level: spectrum.ms_level, @@ -389,12 +380,14 @@ impl SpectrumProcessor { scan_start_time: spectrum.scan_start_time, ion_injection_time: spectrum.ion_injection_time, precursors: spectrum.precursors, - peaks, + masses, + intensities, + mobilities: None, total_ion_current, } } - pub fn process_with_mobility(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { + pub fn process_with_mobility(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { assert!( spectrum.ms_level == 1, "Logic error, mobility processing should only be used for MS1" @@ -419,7 +412,12 @@ impl SpectrumProcessor { .collect::>(); peaks.sort_by(|a, b| a.mass.total_cmp(&b.mass)); - let total_ion_current = peaks.iter().map(|peak| peak.intensity).sum::(); + let (masses, (intensities, mobilities)) = peaks + .into_iter() + .map(|peak| (peak.mass, (peak.intensity, peak.mobility))) + .into_iter() + .collect::<(Vec<_>, (Vec<_>, Vec<_>))>(); + let total_ion_current = intensities.iter().sum::(); ProcessedSpectrum { level: spectrum.ms_level, @@ -428,7 +426,9 @@ impl SpectrumProcessor { scan_start_time: spectrum.scan_start_time, ion_injection_time: spectrum.ion_injection_time, precursors: spectrum.precursors, - peaks, + masses, + intensities, + mobilities: Some(mobilities), total_ion_current, } } diff --git a/crates/sage/src/tmt.rs b/crates/sage/src/tmt.rs index 00b6030d..9379b11c 100644 --- a/crates/sage/src/tmt.rs +++ b/crates/sage/src/tmt.rs @@ -183,22 +183,34 @@ pub struct Quant<'ms3> { /// Quanitified TMT reporter ion intensities pub intensities: Vec>, /// MS3 spectrum - pub spectrum: &'ms3 ProcessedSpectrum, + pub spectrum: &'ms3 ProcessedSpectrum, } /// Return a vector containing the peaks closest to the m/zs defined in /// `labels`, within a given tolerance window. /// This function is MS-level agnostic, so it can be used for either MS2 or MS3 /// quant. -pub fn find_reporter_ions<'a>( - peaks: &'a [Peak], +pub fn find_reporter_ion_intensities<'a>( + masses: &'a [f32], + intensities: &'a [f32], labels: &[f32], label_tolerance: Tolerance, -) -> Vec> { +) -> Vec { labels .iter() .map(|&label| { - spectrum::select_most_intense_peak(peaks, label, label_tolerance, Some(-PROTON)) + let idx = spectrum::select_most_intense_peak( + masses, + intensities, + label, + label_tolerance, + Some(-PROTON), + ); + if let Some(idx) = idx { + intensities[idx] + } else { + 0.0 + } }) .collect() } @@ -305,7 +317,7 @@ pub struct TmtQuant { /// * `isobaric_tolerance`: specify label tolerance /// * `level`: MSn level to extract isobaric peaks from pub fn quantify( - spectra: &[ProcessedSpectrum], + spectra: &[ProcessedSpectrum], isobaric_labels: &Isobaric, isobaric_tolerance: Tolerance, level: u8, @@ -324,14 +336,12 @@ pub fn quantify( .unwrap_or_default(), }; - let peaks = find_reporter_ions( - &spectrum.peaks, + let peaks = find_reporter_ion_intensities( + &spectrum.masses, + &spectrum.intensities, isobaric_labels.reporter_masses(), isobaric_tolerance, - ) - .into_iter() - .map(|peak| peak.map(|p| p.intensity).unwrap_or_default()) - .collect(); + ); Some(TmtQuant { spec_id, From feae91853a53397957a89b8925a05b729cd5b901 Mon Sep 17 00:00:00 2001 From: "J. Sebastian Paez" Date: Wed, 9 Apr 2025 14:29:51 -0700 Subject: [PATCH 2/2] test,fix: Fixed test that referenced peaks and not masses --- crates/sage-cli/tests/integration.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/sage-cli/tests/integration.rs b/crates/sage-cli/tests/integration.rs index 53370172..2b9bcf29 100644 --- a/crates/sage-cli/tests/integration.rs +++ b/crates/sage-cli/tests/integration.rs @@ -15,7 +15,7 @@ fn integration() -> anyhow::Result<()> { let sp = SpectrumProcessor::new(100, true, 0.0); let processed = sp.process(spectra[0].clone()); - assert!(processed.peaks.len() <= 300); + assert!(processed.masses.len() <= 300); let scorer = Scorer { db: &database,