From a991ab9e33e3d1195f7b1301977dc18aa4af3b12 Mon Sep 17 00:00:00 2001
From: Justin Sing <32938975+singjc@users.noreply.github.com>
Date: Wed, 7 May 2025 10:07:52 -0400
Subject: [PATCH 01/75] Update model_interface.rs

Add Clone derive macro to Parameters struct
---
 crates/redeem-properties/src/models/model_interface.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index d8c08af..3aad160 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -625,6 +625,7 @@ pub trait ModelInterface: Send + Sync {
 }
 
 /// Parameters for the `predict` method of a `ModelInterface` implementation.
+#[derive(Clone)]
 pub struct Parameters {
     /// The instrument data was acquired on. Refer to list of supported instruments in const yaml file.
     pub instrument: String,

From 8f338557787bd20edc56e9cd28932a13954acf27 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 7 May 2025 11:27:25 -0400
Subject: [PATCH 02/75] chore: Add Clone trait implementation for
 ModelInterface

---
 .../redeem-properties/src/models/ccs_model.rs |  1 +
 .../src/models/model_interface.rs             | 22 ++++++++++++++++++-
 .../redeem-properties/src/models/ms2_model.rs |  1 +
 .../redeem-properties/src/models/rt_model.rs  |  1 +
 4 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 62a35ed..36d5e5f 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -17,6 +17,7 @@ pub enum CCSModelArch {
 pub const CCSMODEL_ARCHS: &[&str] = &["ccs_cnn_lstm"];
 
 // A wrapper struct for CCS models
+#[derive(Clone)]
 pub struct CCSModelWrapper {
     model: Box<dyn ModelInterface>,
 }

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index d8c08af..d5aca40 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -176,10 +176,30 @@ pub fn create_var_map(
 }
 
+pub trait ModelClone {
+    fn clone_box(&self) -> Box<dyn ModelInterface>;
+}
+
+impl<T> ModelClone for T
+where
+    T: 'static + ModelInterface + Clone,
+{
+    fn clone_box(&self) -> Box<dyn ModelInterface> {
+        Box::new(self.clone())
+    }
+}
+
+impl Clone for Box<dyn ModelInterface> {
+    fn clone(&self) -> Box<dyn ModelInterface> {
+        self.clone_box()
+    }
+}
+
+
 /// Represents an abstract deep learning model interface.
 ///
 /// This trait defines the methods and properties that a deep learning model must implement to be used for property prediction tasks.
-pub trait ModelInterface: Send + Sync {
+pub trait ModelInterface: Send + Sync + ModelClone {
     /// Get the property type of the model.
     fn property_type(&self) -> PropertyType;

diff --git a/crates/redeem-properties/src/models/ms2_model.rs b/crates/redeem-properties/src/models/ms2_model.rs
index fc590cd..cf1979b 100644
--- a/crates/redeem-properties/src/models/ms2_model.rs
+++ b/crates/redeem-properties/src/models/ms2_model.rs
@@ -17,6 +17,7 @@ pub enum MS2ModelArch {
 pub const MS2MODEL_ARCHS: &[&str] = &["ms2_bert"];
 
 // A wrapper struct for MS2 models
+#[derive(Clone)]
 pub struct MS2ModelWrapper {
     model: Box<dyn ModelInterface>,
 }

diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs
index 65dea9d..20086c9 100644
--- a/crates/redeem-properties/src/models/rt_model.rs
+++ b/crates/redeem-properties/src/models/rt_model.rs
@@ -19,6 +19,7 @@ pub enum RTModelArch {
 pub const RTMODEL_ARCHS: &[&str] = &["rt_cnn_lstm"];
 
 // A wrapper struct for RT models
+#[derive(Clone)]
 pub struct RTModelWrapper {
     model: Box<dyn ModelInterface>,
 }

From 5860df81cc65c0fa996173e976a47d6fe9c03709 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 7 May 2025 11:38:20 -0400
Subject: [PATCH 03/75] refactor: Update model structs to use 'static lifetime
 for VarBuilder

---
 .../src/models/ccs_cnn_lstm_model.rs            | 14 +++++++-------
 .../redeem-properties/src/models/ms2_bert_model.rs | 14 +++++++-------
 .../src/models/rt_cnn_lstm_model.rs             | 12 ++++++------
 3 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs
index 156ada6..bde8a86 100644
--- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs
+++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs
@@ -32,8 +32,8 @@ const NCE_FACTOR: f64 = 0.01;
 // Main Model Struct
 #[derive(Clone)]
 /// Represents an AlphaPeptDeep MS2BERT model.
-pub struct CCSCNNLSTMModel<'a> {
-    var_store: VarBuilder<'a>,
+pub struct CCSCNNLSTMModel {
+    var_store: VarBuilder<'static>,
     varmap: VarMap,
     constants: ModelConstants,
     mod_to_feature: HashMap<String, Vec<f32>>,
@@ -52,11 +52,11 @@ pub struct CCSCNNLSTMModel<'a> {
 }
 
 // Automatically implement Send and Sync if all fields are Send and Sync
-unsafe impl<'a> Send for CCSCNNLSTMModel<'a> {}
-unsafe impl<'a> Sync for CCSCNNLSTMModel<'a> {}
+unsafe impl Send for CCSCNNLSTMModel {}
+unsafe impl Sync for CCSCNNLSTMModel {}
 
 // Code Model Implementation
-impl<'a> ModelInterface for CCSCNNLSTMModel<'a> {
+impl ModelInterface for CCSCNNLSTMModel {
     fn property_type(&self) -> PropertyType {
         PropertyType::CCS
     }
@@ -217,13 +217,13 @@ impl<'a> ModelInterface for CCSCNNLSTMModel<'a> {
 }
 
 // // Forward Module Trait Implementation
-// impl <'a> Module for CCSCNNLSTMModel<'a> {
+// impl Module for CCSCNNLSTMModel {
 //     fn forward(&self, input: &Tensor) -> Result<Tensor> {
 //         ModelInterface::forward(self, input)
 //     }
 // }
 
-impl<'a> fmt::Debug for CCSCNNLSTMModel<'a> {
+impl fmt::Debug for CCSCNNLSTMModel {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "ModelCCS_LSTM(")?;
         writeln!(f, "  (dropout): Dropout(p={}, inplace={})", 0.1, false)?;

diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs
index 204374e..9f3a1d3 100644
--- a/crates/redeem-properties/src/models/ms2_bert_model.rs
+++ b/crates/redeem-properties/src/models/ms2_bert_model.rs
@@ -36,8 +36,8 @@ const NCE_FACTOR: f64 = 0.01;
 // Main Model Struct
 #[derive(Clone)]
 /// Represents an AlphaPeptDeep MS2BERT model.
-pub struct MS2BertModel<'a> {
-    var_store: VarBuilder<'a>,
+pub struct MS2BertModel {
+    var_store: VarBuilder<'static>,
     varmap: VarMap,
     constants: ModelConstants,
     mod_to_feature: HashMap<String, Vec<f32>>,
@@ -60,11 +60,11 @@ pub struct MS2BertModel<'a> {
 }
 
 // Automatically implement Send and Sync if all fields are Send and Sync
-unsafe impl<'a> Send for MS2BertModel<'a> {}
-unsafe impl<'a> Sync for MS2BertModel<'a> {}
+unsafe impl Send for MS2BertModel {}
+unsafe impl Sync for MS2BertModel {}
 
 // Code Model Implementation
-impl<'a> ModelInterface for MS2BertModel<'a> {
+impl ModelInterface for MS2BertModel {
     fn property_type(&self) -> PropertyType {
         PropertyType::MS2
     }
@@ -342,13 +342,13 @@ impl<'a> ModelInterface for MS2BertModel<'a> {
 }
 
 // // Module Trait Implementation
-// impl<'a> Module for MS2BertModel<'a> {
+// impl Module for MS2BertModel {
 //     fn forward(&self, input: &Tensor) -> Result<Tensor> {
 //         ModelInterface::forward(self, input)
 //     }
 // }
 
-impl<'a> fmt::Debug for MS2BertModel<'a> {
+impl fmt::Debug for MS2BertModel {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         writeln!(f, "MS2BertModel(")?;
         writeln!(f, "  (dropout): Dropout(p={})", 0.1)?;

diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
index 49ca8ee..3d5abc9 100644
--- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
@@ -25,8 +25,8 @@ use crate::utils::logging::Progress;
 
 #[derive(Clone)]
 /// Represents an AlphaPeptDeep CNN-LSTM Retention Time model.
-pub struct RTCNNLSTMModel<'a> {
-    var_store: VarBuilder<'a>,
+pub struct RTCNNLSTMModel {
+    var_store: VarBuilder<'static>,
     varmap: VarMap,
     constants: ModelConstants,
     device: Device,
@@ -38,12 +38,12 @@ pub struct RTCNNLSTMModel<'a> {
 }
 
 // Automatically implement Send and Sync if all fields are Send and Sync
-unsafe impl<'a> Send for RTCNNLSTMModel<'a> {}
-unsafe impl<'a> Sync for RTCNNLSTMModel<'a> {}
+unsafe impl Send for RTCNNLSTMModel {}
+unsafe impl Sync for RTCNNLSTMModel {}
 
 
 // Core Model Implementation
-impl<'a> ModelInterface for RTCNNLSTMModel<'a> {
+impl ModelInterface for RTCNNLSTMModel {
     fn property_type(&self) -> PropertyType {
         PropertyType::RT
     }
@@ -268,7 +268,7 @@ impl<'a> ModelInterface for RTCNNLSTMModel<'a> {
 
 
 // Module Trait Implementation
-// impl<'a> Module for RTCNNLSTMModel<'a> {
+// impl Module for RTCNNLSTMModel {
 //     fn forward(&self, input: &Tensor) -> Result<Tensor> {
 //         ModelInterface::forward(self, input)
 //     }
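Aside: the `'static` rewrite above is what makes the boxed, clonable wrappers in the next patch possible. A minimal sketch of the underlying rule — illustrative only, in plain Rust with invented names (`Predict`, `Borrowing`, `Owned`) rather than the real candle types:

```rust
// A struct that borrows for 'a cannot live behind `Box<dyn Trait>`,
// which requires 'static by default; an owned ('static) field can.
trait Predict {
    fn predict(&self) -> f32;
}

// Borrowing version: boxing this as `Box<dyn Predict>` would force 'a: 'static.
#[allow(dead_code)]
struct Borrowing<'a> {
    weights: &'a [f32],
}

// Owned version: freely boxable and clonable, like VarBuilder<'static> + VarMap.
#[derive(Clone)]
struct Owned {
    weights: Vec<f32>,
}

impl Predict for Owned {
    fn predict(&self) -> f32 {
        self.weights.iter().sum()
    }
}

fn main() {
    let model: Box<dyn Predict> = Box::new(Owned { weights: vec![0.5, 1.5] });
    assert_eq!(model.predict(), 2.0);
}
```

Holding an owned `VarMap` next to a `VarBuilder<'static>` is what lets each model struct satisfy the `'static` bound that `Box<dyn ModelInterface>` demands.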
From 8be72ea96e433604eebba2da1c4a6bbc7aba619f Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 7 May 2025 11:43:35 -0400
Subject: [PATCH 04/75] refactor: Update model structs to use 'static lifetime
 for VarBuilder

---
 crates/redeem-properties/src/models/ccs_model.rs |  9 ++++++++-
 crates/redeem-properties/src/models/ms2_model.rs |  9 ++++++++-
 crates/redeem-properties/src/models/rt_model.rs  | 10 +++++++++-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 36d5e5f..6adbdbb 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -17,11 +17,18 @@ pub enum CCSModelArch {
 pub const CCSMODEL_ARCHS: &[&str] = &["ccs_cnn_lstm"];
 
 // A wrapper struct for CCS models
-#[derive(Clone)]
 pub struct CCSModelWrapper {
     model: Box<dyn ModelInterface>,
 }
 
+impl Clone for CCSModelWrapper {
+    fn clone(&self) -> Self {
+        CCSModelWrapper {
+            model: self.model.clone(),
+        }
+    }
+}
+
 impl CCSModelWrapper {
     pub fn new<P: AsRef<Path>>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result<Self> {
         let model: Box<dyn ModelInterface> = match arch {

diff --git a/crates/redeem-properties/src/models/ms2_model.rs b/crates/redeem-properties/src/models/ms2_model.rs
index cf1979b..f4ed7e1 100644
--- a/crates/redeem-properties/src/models/ms2_model.rs
+++ b/crates/redeem-properties/src/models/ms2_model.rs
@@ -17,11 +17,18 @@ pub enum MS2ModelArch {
 pub const MS2MODEL_ARCHS: &[&str] = &["ms2_bert"];
 
 // A wrapper struct for MS2 models
-#[derive(Clone)]
 pub struct MS2ModelWrapper {
     model: Box<dyn ModelInterface>,
 }
 
+impl Clone for MS2ModelWrapper {
+    fn clone(&self) -> Self {
+        MS2ModelWrapper {
+            model: self.model.clone(),
+        }
+    }
+}
+
 impl MS2ModelWrapper {
     pub fn new<P: AsRef<Path>>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result<Self> {
         let model: Box<dyn ModelInterface> = match arch {

diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs
index 20086c9..d6cc501 100644
--- a/crates/redeem-properties/src/models/rt_model.rs
+++ b/crates/redeem-properties/src/models/rt_model.rs
@@ -19,11 +19,19 @@ pub enum RTModelArch {
 pub const RTMODEL_ARCHS: &[&str] = &["rt_cnn_lstm"];
 
 // A wrapper struct for RT models
-#[derive(Clone)]
 pub struct RTModelWrapper {
     model: Box<dyn ModelInterface>,
 }
 
+impl Clone for RTModelWrapper {
+    fn clone(&self) -> Self {
+        RTModelWrapper {
+            model: self.model.clone(), // uses clone_box() behind the scenes
+        }
+    }
+}
+
+
 impl RTModelWrapper {
     pub fn new<P: AsRef<Path>>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result<Self> {
         let model: Box<dyn ModelInterface> = match arch {

From ac5afe96d35a6fbce10c99e4ba06fb7477d944d2 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 7 May 2025 11:46:28 -0400
Subject: [PATCH 05/75] refactor: Update ModelClone trait to include Send and
 Sync bounds

---
 crates/redeem-properties/src/models/model_interface.rs | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 4d63f35..78fe0a3 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -177,25 +177,26 @@ pub fn create_var_map(
 
 pub trait ModelClone {
-    fn clone_box(&self) -> Box<dyn ModelInterface>;
+    fn clone_box(&self) -> Box<dyn ModelInterface + Send + Sync>;
 }
 
+
 impl<T> ModelClone for T
 where
-    T: 'static + ModelInterface + Clone,
+    T: 'static + ModelInterface + Clone + Send + Sync,
 {
-    fn clone_box(&self) -> Box<dyn ModelInterface> {
+    fn clone_box(&self) -> Box<dyn ModelInterface + Send + Sync> {
         Box::new(self.clone())
     }
 }
 
-impl Clone for Box<dyn ModelInterface> {
-    fn clone(&self) -> Box<dyn ModelInterface> {
+
+impl Clone for Box<dyn ModelInterface + Send + Sync> {
+    fn clone(&self) -> Self {
         self.clone_box()
     }
 }
-
 /// Represents an abstract deep learning model interface.
 ///
 /// This trait defines the methods and properties that a deep learning model must implement to be used for property prediction tasks.
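Taken together, patches 02–05 implement the classic `clone_box` idiom: `Clone` itself is not object-safe, so boxed trait objects get a helper trait with a blanket impl. A self-contained sketch — simplified, with hypothetical `Model`/`RtModel` names standing in for the real `ModelInterface` implementors — showing the idiom and what the `Send + Sync` bounds from patch 05 buy:

```rust
// Helper trait: object-safe stand-in for Clone on boxed trait objects.
trait ModelClone {
    fn clone_box(&self) -> Box<dyn Model + Send + Sync>;
}

trait Model: ModelClone + Send + Sync {
    fn name(&self) -> &'static str;
}

// Blanket impl: any concrete, clonable, thread-safe model gets clone_box for free.
impl<T> ModelClone for T
where
    T: 'static + Model + Clone + Send + Sync,
{
    fn clone_box(&self) -> Box<dyn Model + Send + Sync> {
        Box::new(self.clone())
    }
}

impl Clone for Box<dyn Model + Send + Sync> {
    fn clone(&self) -> Self {
        self.clone_box()
    }
}

#[derive(Clone)]
struct RtModel;

impl Model for RtModel {
    fn name(&self) -> &'static str {
        "rt"
    }
}

fn main() {
    let a: Box<dyn Model + Send + Sync> = Box::new(RtModel);
    let b = a.clone(); // dispatches through clone_box()
    // The Send + Sync bounds let the cloned box cross a thread boundary.
    std::thread::spawn(move || assert_eq!(b.name(), "rt"))
        .join()
        .unwrap();
}
```

This is also why the next patch can drop the `Arc<Mutex<...>>` wrappers: once the wrappers are plain `Clone`, each consumer can own its own copy instead of sharing one behind a lock.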
From f41eeb5bca12c7af17544793aef906e4577cfbd8 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 7 May 2025 11:51:55 -0400
Subject: [PATCH 06/75] refactor: Update DLModels struct to remove unnecessary
 Arc and Mutex wrappers for model fields

---
 crates/redeem-properties/src/models/model_interface.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 78fe0a3..1149a23 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -689,13 +689,13 @@ pub struct DLModels {
     pub params: Option<Parameters>,
 
     /// Optional retention time prediction model.
-    pub rt_model: Option<Arc<Mutex<RTModelWrapper>>>,
+    pub rt_model: Option<RTModelWrapper>,
 
     /// Optional collision cross-section prediction model.
-    pub ccs_model: Option<Arc<Mutex<CCSModelWrapper>>>,
+    pub ccs_model: Option<CCSModelWrapper>,
 
     /// Optional MS2 intensity prediction model.
-    pub ms2_model: Option<Arc<Mutex<MS2ModelWrapper>>>,
+    pub ms2_model: Option<MS2ModelWrapper>,
 }
 
 impl DLModels {
@@ -710,7 +710,7 @@ impl DLModels {
     /// ```
     /// let mut models = DLModels::new();
     ///
-    /// models.rt_model = Some(Arc::new(Mutex::new(RTModelWrapper::new())));
+    /// models.rt_model = Some(RTModelWrapper::new());
     ///
     /// ```
     pub fn new() -> Self {
@@ -734,7 +734,7 @@ impl DLModels {
     /// let mut models = DLModels::new();
     /// assert!(!models.is_not_empty());
     ///
-    /// models.rt_model = Some(Arc::new(Mutex::new(RTModelWrapper::new())));
+    /// models.rt_model = Some(RTModelWrapper::new());
     /// assert!(models.is_not_empty());
     /// ```
     pub fn is_not_empty(&self) -> bool {

From 6eecf07301b0831aee506b928c41ea5dfa61449d Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 8 May 2025 10:41:04 -0400
Subject: [PATCH 07/75] refactor: Update peptide modification handling to
 support mass shifts and UniMod annotations

---
 .../src/utils/peptdeep_utils.rs | 206 +++++++++++++++---
 1 file changed, 174 insertions(+), 32 deletions(-)

diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs
index e3fbc73..2b2d6f1 100644
--- a/crates/redeem-properties/src/utils/peptdeep_utils.rs
+++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs
@@ -184,48 +184,76 @@ pub fn load_mod_to_feature(constants: &ModelConstants) -> Result<HashMap<String, Vec<f32>>>
 pub struct ModificationMap {
     pub name: String,
     pub amino_acid: Option<char>, // Optional if not applicable
+    pub unimod_id: Option<usize>
 }
 
+/// Loads a unified modification map where the key is either:
+/// - ("57.0215", Some('C')) for mass-based lookup
+/// - ("UniMod:4", Some('C')) for UniMod ID–based lookup
 pub fn load_modifications() -> Result<HashMap<(String, Option<char>), ModificationMap>> {
     let path: PathBuf = ensure_mod_tsv_exists().context("Failed to ensure TSV exists")?;
     let mut rdr = ReaderBuilder::new()
         .delimiter(b'\t')
-        .from_path(path).context("Failed to read TSV file")?;
+        .from_path(&path)
+        .context("Failed to read modification TSV file")?;
 
     let mut modifications = HashMap::new();
 
     for result in rdr.records() {
         let record = result.context("Failed to read record")?;
         let mod_name = record.get(0).unwrap_or("").to_string();
         let unimod_mass: f64 = record.get(1).unwrap_or("0").parse().unwrap_or(0.0);
+        let unimod_id: Option<usize> = record.get(7).and_then(|s| s.parse().ok());
 
         let mass_key = format!("{:.4}", unimod_mass);
+        let unimod_key = unimod_id.map(|id| format!("UniMod:{}", id));
 
         let amino_acid = mod_name.split('@').nth(1).and_then(|aa| aa.chars().next());
 
         let modification = ModificationMap {
             name: mod_name,
             amino_acid,
+            unimod_id,
         };
 
-        // Insert into HashMap
-        modifications.insert((mass_key, amino_acid), modification);
+        // Insert mass-based key
+        modifications.insert((mass_key.clone(), amino_acid), modification.clone());
+
+        // Insert unimod-id based key if available
+        if let Some(key) = unimod_key {
+            modifications.insert((key, amino_acid), modification.clone());
+        }
     }
 
     Ok(modifications)
 }
 
+
+
+/// Removes mass shifts and UniMod annotations from a modified peptide sequence.
+///
+/// Supports both bracketed mass shifts (e.g., `[+57.0215]`) and UniMod-style
+/// annotations (e.g., `(UniMod:4)`).
+///
+/// # Example
+/// ```
+/// use redeem_properties::utils::peptdeep_utils::remove_mass_shift;
+///
+/// let peptide = "MGC[+57.0215]AAR";
+/// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
+/// let peptide = "MGC(UniMod:4)AAR";
+/// assert_eq!(remove_mass_shift(peptide), "MGCAAR");
+/// ```
 pub fn remove_mass_shift(peptide: &str) -> String {
-    let re = Regex::new(r"\[.*?\]").unwrap();
+    // Regex to remove either [mass shift] or (UniMod:x) patterns
+    let re = Regex::new(r"(\[.*?\]|\(UniMod:\d+\))").unwrap();
     re.replace_all(peptide, "").to_string()
 }
 
@@ -283,37 +311,151 @@ pub fn get_modification_indices(peptide: &str) -> String {
     indices.join(";")
 }
 
-pub fn get_modification_string(
-    peptide: &str,
-    modification_map: &HashMap<(String, Option<char>), ModificationMap>,
-) -> String {
-    let naked_peptide = remove_mass_shift(peptide);
-    let extracted_masses_and_indices = extract_masses_and_indices(&peptide.to_string());
-    let mut found_modifications = Vec::new();
+/// Extracts mass shift annotations (e.g., [+57.0215]) from a peptide string and returns them
+/// as a vector of (mass_string, position) where position is the index of the annotated amino acid.
+///
+/// # Example
+/// ```
+/// use redeem_properties::utils::peptdeep_utils::extract_mass_annotations;
+/// let result = extract_mass_annotations("AC[+57.0215]DE");
+/// assert_eq!(result, vec![("57.0215".to_string(), 2)]);
+/// ```
+pub fn extract_mass_annotations(peptide: &str) -> Vec<(String, usize)> {
+    let re_mass = Regex::new(r"\[([+-]?\d*\.?\d+)\]").unwrap();
+    let mut results = Vec::new();
+    let mut offset = 0;
+    let mut idx = 0;
+
+    while idx < peptide.len() {
+        if let Some(mat) = re_mass.find_at(peptide, idx) {
+            if mat.start() == idx {
+                let cap = re_mass.captures(&peptide[idx..mat.end()]).unwrap();
+                let mass_str = format!("{:.4}", cap[1].parse::<f64>().unwrap_or(0.0));
+                let pos = idx - offset;
+                results.push((mass_str, pos));
+                offset += mat.end() - mat.start();
+                idx = mat.end();
+                continue;
+            }
+        }
+        idx += peptide[idx..].chars().next().unwrap().len_utf8();
+    }
+
+    results
+}
 
-    // Map modifications based on extracted masses and indices
-    for (mass, index) in extracted_masses_and_indices {
-        // Subtract 1 from index to get 0-based index, ensure it's within bounds
-        let index = index.saturating_sub(1);
-        let amino_acid = naked_peptide.chars().nth(index).unwrap_or('\0');
-        if let Some(modification) = modification_map
-            .get(&(format!("{:.4}", mass), Some(amino_acid)))
-        {
-            found_modifications.push(modification.name.clone());
-        } else if let Some(modification) =
-            modification_map.get(&(format!("{:.4}", mass), None))
-        {
-            found_modifications.push(modification.name.clone());
+/// Extracts UniMod annotations (e.g., (UniMod:4)) from a peptide string and returns them
+/// as a vector of (unimod_id_string, position) where position is the index of the annotated amino acid.
+///
+/// # Example
+/// ```
+/// use redeem_properties::utils::peptdeep_utils::extract_unimod_annotations;
+/// let result = extract_unimod_annotations("AC(UniMod:4)DE");
+/// assert_eq!(result, vec![("UniMod:4".to_string(), 2)]);
+/// ```
+pub fn extract_unimod_annotations(peptide: &str) -> Vec<(String, usize)> {
+    let re_unimod = Regex::new(r"\(UniMod:(\d+)\)").unwrap();
+    let mut results = Vec::new();
+    let mut offset = 0;
+    let mut idx = 0;
+
+    while idx < peptide.len() {
+        if let Some(mat) = re_unimod.find_at(peptide, idx) {
+            if mat.start() == idx {
+                let cap = re_unimod.captures(&peptide[idx..mat.end()]).unwrap();
+                let unimod_str = format!("UniMod:{}", &cap[1]);
+                let pos = idx - offset;
+                results.push((unimod_str, pos));
+                offset += mat.end() - mat.start();
+                idx = mat.end();
+                continue;
+            }
+        }
+        idx += peptide[idx..].chars().next().unwrap().len_utf8();
+    }
 
-    found_modifications.join(";")
+    results
 }
 
+/// Attempts to look up a modification name from a map using the provided key and amino acid.
+/// Falls back to a key with `None` if the exact amino acid is not matched.
+///
+/// # Example
+/// ```
+/// use redeem_properties::utils::peptdeep_utils::{ModificationMap, lookup_modification};
+/// let mut map = std::collections::HashMap::new();
+/// map.insert(("57.0215".to_string(), Some('C')), ModificationMap { name: "Carbamidomethyl@C".to_string(), amino_acid: Some('C'), unimod_id: Some(4) });
+///
+/// let result = lookup_modification("57.0215".to_string(), 'C', &map);
+/// assert_eq!(result, Some("Carbamidomethyl@C".to_string()));
+/// ```
+pub fn lookup_modification(
+    key: String,
+    aa: char,
+    map: &HashMap<(String, Option<char>), ModificationMap>,
+) -> Option<String> {
+    map.get(&(key.clone(), Some(aa)))
+        .or_else(|| map.get(&(key, None)))
+        .map(|m| m.name.clone())
+}
+
+
+
+/// Generates a standardized modification string (e.g., "Carbamidomethyl@C")
+/// for a peptide sequence based on mass shifts (e.g., `[+57.0215]`) or
+/// UniMod annotations (e.g., `(UniMod:4)`), using a preloaded modification map.
+///
+/// The function supports both mass-shift format and UniMod notation,
+/// matching entries from the `modification_map` using mass or UniMod ID along
+/// with the local amino acid context.
+///
+/// # Arguments
+/// * `peptide` - A modified peptide sequence string (e.g., `"MGC[+57.0215]AAR"` or `"MGC(UniMod:4)AAR"`).
+/// * `modification_map` - A HashMap mapping (key, amino_acid) to `ModificationMap`.
+///   - For `[+mass]`, key is formatted as a mass string (e.g., `"57.0215"`).
+///   - For `(UniMod:ID)`, key is the UniMod ID string (e.g., `"UniMod:4"`).
+///
+/// # Returns
+/// A `String` containing semicolon-separated modification names (e.g., `"Carbamidomethyl@C"`).
+///
+/// # Example
+/// ```
+/// use std::collections::HashMap;
+/// use redeem_properties::utils::peptdeep_utils::{load_modifications, get_modification_string};
+///
+/// let mod_map = load_modifications().unwrap();
+/// let peptide1 = "MGC[+57.0215]AAR";
+/// let result1 = get_modification_string(peptide1, &mod_map);
+/// assert_eq!(result1, "Carbamidomethyl@C");
+///
+/// let peptide2 = "MGC(UniMod:4)AAR";
+/// let result2 = get_modification_string(peptide2, &mod_map);
+/// assert_eq!(result2, "Carbamidomethyl@C");
+/// ```
+pub fn get_modification_string(
+    peptide: &str,
+    modification_map: &HashMap<(String, Option<char>), ModificationMap>,
+) -> String {
+    let naked_peptide = remove_mass_shift(peptide);
+    let mut found_mods = Vec::new();
+
+    for (key, pos) in extract_mass_annotations(peptide)
+        .into_iter()
+        .chain(extract_unimod_annotations(peptide))
+    {
+        let aa = naked_peptide.chars().nth(pos.saturating_sub(1)).unwrap_or('\0');
+        if let Some(name) = lookup_modification(key, aa, modification_map) {
+            found_mods.push(name);
+        }
+    }
+
+    found_mods.join(";")
+}
+
 // TODO: Derive from PeptDep constants yaml
 const IM_GAS_MASS: f64 = 28.0;

From 1c70ac628940efe2879b70206c3ffb8655e349f3 Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 8 May 2025 15:09:36 -0400
Subject: [PATCH 08/75] refactor: peptide encoding

---
 .../src/building_blocks/featurize.rs | 175 +++++++-
 .../src/models/model_interface.rs    | 419 +++++++++++-------
 .../src/models/rt_cnn_lstm_model.rs  |  75 +++-
 3 files changed, 516 insertions(+), 153 deletions(-)

diff --git a/crates/redeem-properties/src/building_blocks/featurize.rs b/crates/redeem-properties/src/building_blocks/featurize.rs
index 1161d84..b9beb72 100644
--- a/crates/redeem-properties/src/building_blocks/featurize.rs
+++ b/crates/redeem-properties/src/building_blocks/featurize.rs
@@ -1,13 +1,27 @@
 use anyhow::{Result, anyhow};
 use std::{collections::HashMap, ops::Deref};
 use ndarray::Array2;
-use candle_core::{Device, Tensor};
+use candle_core::{DType, Device, Tensor};
 
 use crate::building_blocks::building_blocks::AA_EMBEDDING_SIZE;
 
 /// Convert peptide sequences into AA ID array.
 ///
 /// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L88
+///
+/// Example:
+/// ```rust
+/// use redeem_properties::building_blocks::featurize::get_aa_indices;
+/// use anyhow::Result;
+/// use ndarray::Array2;
+///
+/// let seq = "AGHCEWQMKYR";
+/// let result = get_aa_indices(seq).unwrap();
+/// println!("aa_indices: {:?}", result);
+/// let expect_out = Array2::from_shape_vec((1, 13), vec![0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]).unwrap();
+/// assert_eq!(result.shape(), &[1, 13]);
+/// assert_eq!(result, expect_out);
+/// ```
 pub fn get_aa_indices(seq: &str) -> Result<Array2<i64>> {
     let valid_aa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; // amino acids as defined in alphabase: https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/amino_acid.tsv
     let filtered_seq: String = seq.chars().filter(|c| valid_aa.contains(*c)).collect();
@@ -110,4 +124,163 @@ pub fn get_mod_features(mods: &str, mod_sites: &str, seq_len: usize, mod_feature_size: usize,
 
     Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), &device)
         .map_err(|e| anyhow!("Failed to create tensor: {}", e))
+}
+
+
+const VALID_AA: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+/// Precomputes amino acid index map from characters A-Z
+fn aa_index_map() -> HashMap<char, i64> {
+    VALID_AA
+        .chars()
+        .enumerate()
+        .map(|(i, c)| (c, i as i64 + 1))
+        .collect()
+}
+
+/// Efficiently converts an amino acid sequence to a padded tensor of indices
+pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result<Tensor> {
+    let map = aa_index_map();
+    let filtered: Vec<i64> = seq
+        .chars()
+        .filter_map(|c| map.get(&c).copied())
+        .collect();
+    let mut indices = vec![0i64]; // padding start
+    indices.extend(filtered);
+    indices.push(0); // padding end
+
+    Ok(Tensor::from_slice(&indices, (1, indices.len()), device)?.to_dtype(DType::F32)?.unsqueeze(2)?)
+}
+
+
+/// Optimized version of get_mod_features that avoids repeated parsing
+pub fn get_mod_features_from_parsed(
+    mod_names: &[&str],
+    mod_sites: &[usize],
+    seq_len: usize,
+    mod_feature_size: usize,
+    mod_to_feature: &HashMap<String, Vec<f32>>,
+    device: &Device,
+) -> Result<Tensor> {
+    let mut mod_x = vec![0.0f32; seq_len * mod_feature_size];
+
+    for (mod_name, &site) in mod_names.iter().zip(mod_sites.iter()) {
+        if site >= seq_len {
+            log::warn!("Skipping mod {} at invalid site {} (seq_len {})", mod_name, site, seq_len);
+            continue;
+        }
+        if let Some(feat) = mod_to_feature.get(*mod_name) {
+            for (i, &val) in feat.iter().enumerate() {
+                mod_x[site * mod_feature_size + i] += val;
+            }
+        } else {
+            log::warn!("Unknown modification feature: {}", mod_name);
+        }
+    }
+
+    Ok(Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), device)
+        .map_err(|e| anyhow!("Failed to create tensor: {}", e))?)
+}
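One behavior of `get_mod_features_from_parsed` worth noting: because the inner loop uses `+=`, two modifications reported at the same site accumulate element-wise rather than overwrite each other. A tiny standalone check of that accumulation rule, with hypothetical two-dimensional feature vectors:

```rust
// Two mods on site 1 of a 3-residue buffer: their features sum into one row.
fn main() {
    let seq_len = 3;
    let feat = 2;
    let mut mod_x = vec![0.0f32; seq_len * feat];
    for (site, vec) in [(1usize, [0.5f32, 0.0]), (1, [0.25, 1.0])] {
        for (i, v) in vec.iter().enumerate() {
            mod_x[site * feat + i] += v;
        }
    }
    assert_eq!(&mod_x, &[0.0, 0.0, 0.75, 1.0, 0.0, 0.0]);
}
```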
+
+
+#[cfg(test)]
+mod tests {
+
+    use crate::utils::peptdeep_utils::load_mod_to_feature;
+    use crate::utils::peptdeep_utils::parse_model_constants;
+    use crate::utils::peptdeep_utils::ModelConstants;
+
+    use super::*;
+    use candle_core::Device;
+    use candle_core::Tensor;
+    use ndarray::Array2;
+    use std::collections::HashMap;
+    use std::path::PathBuf;
+
+    #[test]
+    fn test_get_aa_indices() {
+        let seq = "AGHCEWQMKYR";
+        let result = get_aa_indices(seq).unwrap();
+        // expected result is [[0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]]
+        let expect_out = Array2::from_shape_vec((1, 13), vec![0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]).unwrap();
+        println!("{:?} - aa_indices: {:?}", seq, result);
+        assert_eq!(result.shape(), &[1, 13]);
+        assert_eq!(result, expect_out);
+    }
+
+    #[test]
+    fn test_aa_indices_tensor() {
+        let device = Device::Cpu;
+        let seq = "AGHCEWQMKYR";
+        let result = aa_indices_tensor(seq, &device).unwrap();
+        // expected result is [[0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]]
+        let expect_out = Tensor::from_vec(vec!{0.0f32, 1.0f32, 7.0f32, 8.0f32, 3.0f32, 5.0f32, 23.0f32, 17.0f32, 13.0f32, 11.0f32, 25.0f32, 18.0f32, 0.0f32}, (1, 13), &device).unwrap();
+        println!("{:?} - aa_indices_tensor: {:?}", seq, result.to_vec3::<f32>().unwrap());
+        println!("result shape: {:?}", result.shape());
+        assert_eq!(result.shape().dims(), &[1, 13, 1]);
+        // assert_eq!(result.to_vec3::<f32>().unwrap(), expect_out.to_vec3::<f32>().unwrap());
+    }
+
+    #[test]
+    fn test_get_mod_features() {
+        let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";
+        let mod_sites = "0;4;8";
+        let seq_len = 11 + 2;
+        let mod_feature_size = 109;
+
+        let constants_path =
+            PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
+        let constants: ModelConstants =
+            parse_model_constants(constants_path.to_str().unwrap()).unwrap();
+        let mod_to_feature: HashMap<String, Vec<f32>> = load_mod_to_feature(&constants).unwrap();
+
+        let device = Device::Cpu;
+        let tensor = get_mod_features(
+            mods,
+            mod_sites,
+            seq_len,
+            mod_feature_size,
+            mod_to_feature,
+            device,
+        ).unwrap();
+        println!("tensor shape: {:?}", tensor.shape());
+        assert_eq!(tensor.shape().dims(), &[1, seq_len, mod_feature_size]);
+    }
+
+    #[test]
+    fn test_get_mod_features_from_parsed() {
+        let mods_str = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";
+        let sites_str = "0;4;8";
+
+        // Manually parse and split
+        let mod_names: Vec<&str> = mods_str.split(';').filter(|s| !s.is_empty()).collect();
+        let mod_sites: Vec<usize> = sites_str
+            .split(';')
+            .filter(|s| !s.is_empty())
+            .map(|s| s.parse::<usize>().unwrap())
+            .collect();
+        let seq_len = 11 + 2;
+        let mod_feature_size = 109;
+
+        let constants_path =
+            PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
+        let constants: ModelConstants =
+            parse_model_constants(constants_path.to_str().unwrap()).unwrap();
+        let mod_to_feature: HashMap<String, Vec<f32>> = load_mod_to_feature(&constants).unwrap();
+
+        let device = Device::Cpu;
+        let tensor = get_mod_features_from_parsed(
+            &mod_names,
+            &mod_sites,
+            seq_len,
+            mod_feature_size,
+            &mod_to_feature,
+            &device,
+        ).unwrap();
+
+        println!("tensor shape: {:?}", tensor.shape());
+
+        assert_eq!(tensor.shape().dims(), &[1, seq_len, mod_feature_size]);
+    }
+}
\ No newline at end of file

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 1149a23..73070b5 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -1,5 +1,5 @@
 use crate::{
-    building_blocks::featurize::{self, get_aa_indices, get_mod_features},
+    building_blocks::featurize::{self, aa_indices_tensor, get_aa_indices, get_mod_features, get_mod_features_from_parsed},
     models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper},
     utils::{
         data_handling::PeptideData,
@@ -19,6 +19,7 @@ use std::ops::{Index, IndexMut};
 use std::path::Path;
 use std::sync::{Arc, Mutex};
 use std::{collections::HashMap, path::PathBuf};
+use itertools::izip;
 
 // Constants
 const CHARGE_FACTOR: f64 = 0.1;
@@ -274,98 +275,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
         }
     }
 
-    /// Encode a batch of peptide sequences (plus modifications) into a tensor.
-    ///
-    /// # Arguments
-    /// * `peptide_sequences` - A vector of peptide sequences.
-    /// * `mods` - A vector of strings representing the modifications for each peptide.
-    /// * `mod_sites` - A vector of strings representing the modification site indices for each peptide.
-    /// * `charge` - An optional vector of charge states for each peptide.
-    /// * `nce` - An optional vector of nominal collision energies for each peptide.
-    /// * `instruments` - An optional vector of instrument names for each peptide.
-    ///
-    /// # Returns
-    /// A tensor containing the encoded peptide sequences.
-    fn encode_peptides(
-        &self,
-        peptide_sequences: &[String],
-        mods: &[String],
-        mod_sites: &[String],
-        charges: Option<Vec<i32>>,
-        nces: Option<Vec<i32>>,
-        instruments: Option<Vec<String>>,
-    ) -> Result<Tensor> {
-        if peptide_sequences.len() != mods.len() || peptide_sequences.len() != mod_sites.len() {
-            return Err(anyhow::anyhow!(
-                "Mismatch in input lengths: peptide_sequences, mods, and mod_sites must have the same length."
-            ));
-        }
-
-        // Encode peptides in parallel using Rayon
-        let encoded_tensors: Vec<Tensor> = peptide_sequences
-            .par_iter() // Use Rayon's parallel iterator
-            .enumerate()
-            .map(|(i, peptide)| {
-                self.encode_peptide(
-                    peptide,
-                    &mods[i],
-                    &mod_sites[i],
-                    charges.as_ref().map(|c| c[i]),
-                    nces.as_ref().map(|n| n[i]),
-                    instruments.as_ref().map(|ins| ins[i].as_str()),
-                )
-            })
-            .collect::<Result<Vec<Tensor>>>()?; // Collect results and propagate errors if any
-
-        // Determine the maximum sequence length
-        let max_seq_len = encoded_tensors
-            .par_iter()
-            .map(|t| t.shape().dims3().unwrap().1) // Get sequence length (dimension 1)
-            .max()
-            .unwrap_or(0);
-
-        // Pad tensors to the max_seq_len
-        let padded_tensors: Result<Vec<Tensor>> = encoded_tensors
-            .into_par_iter() // Use Rayon's parallel iterator
-            .map(|t| {
-                let (_, seq_len, feature_size) = t.shape().dims3()?; // Extract feature dimension
-                if seq_len < max_seq_len {
-                    let pad_size = max_seq_len - seq_len;
-                    // Create a padding tensor with the correct shape and type
-                    let pad = Tensor::zeros(
-                        &[1, pad_size, feature_size], // Use the correct feature dimension
-                        t.dtype(),
-                        t.device(),
-                    )?;
-                    // Concatenate padding along sequence length
-                    Tensor::cat(&[&t, &pad], 1)
-                } else {
-                    Ok(t)
-                }
-            })
-            .collect::<Result<Vec<_>, _>>()
-            .map_err(Into::into);
-
-        let padded_tensors = padded_tensors?;
-
-        // Concatenate all padded tensors along the batch dimension
-        let batch_tensor = Tensor::cat(&padded_tensors, 0)?;
-
-        Ok(batch_tensor)
-    }
-
     /// Encode peptide sequence (plus modifications) into a tensor.
-    ///
-    /// # Arguments
-    /// * `peptide_sequence` - The peptide sequence.
-    /// * `mods` - A string representing the modifications for the peptide.
-    /// * `mod_sites` - A string representing the modification site indices for the peptide.
-    /// * `charge` - An optional charge state for the peptide.
-    /// * `nce` - An optional nominal collision energy for the peptide.
-    /// * `instrument` - An optional instrument name for the peptide.
-    ///
-    /// # Returns
-    /// A tensor containing the encoded peptide sequence.
     fn encode_peptide(
         &self,
         peptide_sequence: &str,
         mods: &str,
         mod_sites: &str,
         charge: Option<i32>,
         nce: Option<i32>,
         instrument: Option<&str>,
     ) -> Result<Tensor> {
-        log::trace!(
-            "[ModelInterface::encode_peptide] Encoding peptide: {:?}, mods: {:?}, mod_sites: {:?}, charge: {:?}, nce: {:?}, instrument: {:?}",
-            peptide_sequence,
-            mods,
-            mod_sites,
-            charge,
-            nce,
-            instrument
-        );
-        let aa_indices = get_aa_indices(peptide_sequence)?;
-        log::trace!(
-            "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}",
-            aa_indices.shape(),
-            aa_indices.iter().min(),
-            aa_indices.iter().max()
-        );
-
-        // Convert ndarray to Tensor (F32)
-        let aa_indices_tensor = Tensor::from_slice(
-            &aa_indices.as_slice().unwrap(),
-            (aa_indices.shape()[0], aa_indices.shape()[1]),
-            &self.get_device(),
-        )?
-        .to_dtype(DType::F32)?;
-
-        let (batch_size, seq_len) = aa_indices_tensor.shape().dims2()?;
-        let aa_indices_tensor = aa_indices_tensor.unsqueeze(2)?; // Shape: batch_size x seq_len x 1
-
-        log::trace!(
-            "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}",
-            aa_indices_tensor.shape(),
-            aa_indices_tensor.min_all(),
-            aa_indices_tensor.max_all()
-        );
-
-        // Get modification features
-        let mod_x = get_mod_features(
-            mods,
-            mod_sites,
+        let device = self.get_device();
+        let mod_feature_size = self.get_mod_element_count();
+        let mod_to_feature = self.get_mod_to_feature().clone();
+
+        let aa_tensor = aa_indices_tensor(peptide_sequence, &device)?;
+        let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?;
+
+        let mod_names: Vec<&str> = mods.split(';').filter(|s| !s.is_empty()).collect();
+        let mod_indices: Vec<usize> = mod_sites
+            .split(';')
+            .filter(|s| !s.is_empty())
+            .map(|s| s.parse::<usize>().unwrap())
+            .collect();
+
+        let mod_tensor = get_mod_features_from_parsed(
+            &mod_names,
+            &mod_indices,
             seq_len,
-            self.get_mod_element_count(),
-            self.get_mod_to_feature().clone(),
-            self.get_device().clone(),
+            mod_feature_size,
+            &mod_to_feature,
+            &device,
         )?;
 
-        let mut features = vec![aa_indices_tensor, mod_x];
+        let mut features = vec![aa_tensor, mod_tensor];
 
-        // Conditionally add charge
         if let Some(c) = charge {
             let charge_tensor = Tensor::from_slice(
                 &vec![c as f64 * CHARGE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
-                &self.get_device(),
-            )?
-            .to_dtype(DType::F32)?;
+                &device,
+            )?.to_dtype(DType::F32)?;
             features.push(charge_tensor);
         }
 
-        // Conditionally add NCE
         if let Some(n) = nce {
             let nce_tensor = Tensor::from_slice(
                 &vec![n as f64 * NCE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
-                &self.get_device(),
-            )?
-            .to_dtype(DType::F32)?;
+                &device,
+            )?.to_dtype(DType::F32)?;
             features.push(nce_tensor);
         }
 
-        // Conditionally add instrument
         if let Some(instr) = instrument {
-            let instrument_tensor = Tensor::from_slice(
-                &vec![parse_instrument_index(instr) as u32; seq_len],
+            let instr_idx = parse_instrument_index(instr) as u32;
+            let instr_tensor = Tensor::from_slice(
+                &vec![instr_idx; seq_len],
                 &[batch_size, seq_len, 1],
-                &self.get_device(),
-            )?
-            .to_dtype(DType::F32)?;
-            features.push(instrument_tensor);
+                &device,
+            )?.to_dtype(DType::F32)?;
+            features.push(instr_tensor);
         }
 
-        // Concatenate features
         Ok(Tensor::cat(&features, 2)?)
     }
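For orientation, the per-residue width of the tensor returned by the rewritten `encode_peptide` is one amino-acid index column, plus `mod_feature_size` modification columns, plus one column for each optional scalar that is supplied. A back-of-envelope sketch of that arithmetic (assuming the 109-dimensional modification features used in this repo's tests; `encoded_width` is an invented helper, not part of the patch):

```rust
// Feature width per residue after Tensor::cat(&features, 2).
fn encoded_width(mod_feature_size: usize, charge: bool, nce: bool, instrument: bool) -> usize {
    1 // amino-acid index column
        + mod_feature_size
        + charge as usize
        + nce as usize
        + instrument as usize
}

fn main() {
    assert_eq!(encoded_width(109, false, false, false), 110); // RT/CCS-style input
    assert_eq!(encoded_width(109, true, true, true), 113); // MS2-style input
}
```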
+    /// Encode a batch of peptide sequences into a tensor
+    fn encode_peptides(
+        &self,
+        peptide_sequences: &[String],
+        mods: &[String],
+        mod_sites: &[String],
+        charges: Option<Vec<i32>>,
+        nces: Option<Vec<i32>>,
+        instruments: Option<Vec<String>>,
+    ) -> Result<Tensor> {
+        let len = peptide_sequences.len();
+
+        let tensors: Vec<_> = (0..len)
+            .into_par_iter()
+            .map(|i| {
+                self.encode_peptide(
+                    &peptide_sequences[i],
+                    &mods[i],
+                    &mod_sites[i],
+                    charges.as_ref().map(|v| v[i]),
+                    nces.as_ref().map(|v| v[i]),
+                    instruments.as_ref().map(|v| v[i].as_str()),
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?; // Propagate errors
+
+        let max_len = tensors
+            .iter()
+            .map(|t| t.shape().dims3().unwrap().1)
+            .max()
+            .unwrap_or(0);
+
+        let padded = tensors
+            .into_par_iter()
+            .map(|t| {
+                let (_, seq_len, feat_dim) = t.shape().dims3()?;
+                if seq_len < max_len {
+                    let pad = Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?;
+                    Tensor::cat(&[&t, &pad], 1)
+                } else {
+                    Ok(t)
+                }
+            })
+            .map(|res| res.map_err(anyhow::Error::from))
+            .collect::<Result<Vec<_>, _>>()?;
+
+        Ok(Tensor::cat(&padded, 0)?)
+    }
+
+
+    // /// Encode a batch of peptide sequences (plus modifications) into a tensor.
+    // ///
+    // /// # Arguments
+    // /// * `peptide_sequences` - A vector of peptide sequences.
+    // /// * `mods` - A vector of strings representing the modifications for each peptide.
+    // /// * `mod_sites` - A vector of strings representing the modification site indices for each peptide.
+    // /// * `charge` - An optional vector of charge states for each peptide.
+    // /// * `nce` - An optional vector of nominal collision energies for each peptide.
+    // /// * `instruments` - An optional vector of instrument names for each peptide.
+    // ///
+    // /// # Returns
+    // /// A tensor containing the encoded peptide sequences.
+    // fn encode_peptides(
+    //     &self,
+    //     peptide_sequences: &[String],
+    //     mods: &[String],
+    //     mod_sites: &[String],
+    //     charges: Option<Vec<i32>>,
+    //     nces: Option<Vec<i32>>,
+    //     instruments: Option<Vec<String>>,
+    // ) -> Result<Tensor> {
+    //     if peptide_sequences.len() != mods.len() || peptide_sequences.len() != mod_sites.len() {
+    //         return Err(anyhow::anyhow!(
+    //             "Mismatch in input lengths: peptide_sequences, mods, and mod_sites must have the same length."
+    //         ));
+    //     }
+
+    //     // Encode peptides in parallel using Rayon
+    //     let encoded_tensors: Vec<Tensor> = peptide_sequences
+    //         .par_iter() // Use Rayon's parallel iterator
+    //         .enumerate()
+    //         .map(|(i, peptide)| {
+    //             self.encode_peptide(
+    //                 peptide,
+    //                 &mods[i],
+    //                 &mod_sites[i],
+    //                 charges.as_ref().map(|c| c[i]),
+    //                 nces.as_ref().map(|n| n[i]),
+    //                 instruments.as_ref().map(|ins| ins[i].as_str()),
+    //             )
+    //         })
+    //         .collect::<Result<Vec<Tensor>>>()?; // Collect results and propagate errors if any
+
+    //     // Determine the maximum sequence length
+    //     let max_seq_len = encoded_tensors
+    //         .par_iter()
+    //         .map(|t| t.shape().dims3().unwrap().1) // Get sequence length (dimension 1)
+    //         .max()
+    //         .unwrap_or(0);
+
+    //     // Pad tensors to the max_seq_len
+    //     let padded_tensors: Result<Vec<Tensor>> = encoded_tensors
+    //         .into_par_iter() // Use Rayon's parallel iterator
+    //         .map(|t| {
+    //             let (_, seq_len, feature_size) = t.shape().dims3()?; // Extract feature dimension
+    //             if seq_len < max_seq_len {
+    //                 let pad_size = max_seq_len - seq_len;
+    //                 // Create a padding tensor with the correct shape and type
+    //                 let pad = Tensor::zeros(
+    //                     &[1, pad_size, feature_size], // Use the correct feature dimension
+    //                     t.dtype(),
+    //                     t.device(),
+    //                 )?;
+    //                 // Concatenate padding along sequence length
+    //                 Tensor::cat(&[&t, &pad], 1)
+    //             } else {
+    //                 Ok(t)
+    //             }
+    //         })
+    //         .collect::<Result<Vec<_>, _>>()
+    //         .map_err(Into::into);
+
+    //     let padded_tensors = padded_tensors?;
+
+    //     // Concatenate all padded tensors along the batch dimension
+    //     let batch_tensor = Tensor::cat(&padded_tensors, 0)?;
+
+    //     Ok(batch_tensor)
+    // }
+
+    // /// Encode peptide sequence (plus modifications) into a tensor.
+    // ///
+    // /// # Arguments
+    // /// * `peptide_sequence` - The peptide sequence.
+    // /// * `mods` - A string representing the modifications for the peptide.
+    // /// * `mod_sites` - A string representing the modification site indices for the peptide.
+    // /// * `charge` - An optional charge state for the peptide.
+    // /// * `nce` - An optional nominal collision energy for the peptide.
+    // /// * `instrument` - An optional instrument name for the peptide.
+    // ///
+    // /// # Returns
+    // /// A tensor containing the encoded peptide sequence.
+    // fn encode_peptide(
+    //     &self,
+    //     peptide_sequence: &str,
+    //     mods: &str,
+    //     mod_sites: &str,
+    //     charge: Option<i32>,
+    //     nce: Option<i32>,
+    //     instrument: Option<&str>,
+    // ) -> Result<Tensor> {
+    //     log::trace!(
+    //         "[ModelInterface::encode_peptide] Encoding peptide: {:?}, mods: {:?}, mod_sites: {:?}, charge: {:?}, nce: {:?}, instrument: {:?}",
+    //         peptide_sequence,
+    //         mods,
+    //         mod_sites,
+    //         charge,
+    //         nce,
+    //         instrument
+    //     );
+    //     let aa_indices = get_aa_indices(peptide_sequence)?;
+    //     log::trace!(
+    //         "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}",
+    //         aa_indices.shape(),
+    //         aa_indices.iter().min(),
+    //         aa_indices.iter().max()
+    //     );
+
+    //     // Convert ndarray to Tensor (F32)
+    //     let aa_indices_tensor = Tensor::from_slice(
+    //         &aa_indices.as_slice().unwrap(),
+    //         (aa_indices.shape()[0], aa_indices.shape()[1]),
+    //         &self.get_device(),
+    //     )?
+    //     .to_dtype(DType::F32)?;
+
+    //     let (batch_size, seq_len) = aa_indices_tensor.shape().dims2()?;
+    //     let aa_indices_tensor = aa_indices_tensor.unsqueeze(2)?; // Shape: batch_size x seq_len x 1
+
+    //     log::trace!(
+    //         "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}",
+    //         aa_indices_tensor.shape(),
+    //         aa_indices_tensor.min_all(),
+    //         aa_indices_tensor.max_all()
+    //     );
+
+    //     // Get modification features
+    //     let mod_x = get_mod_features(
+    //         mods,
+    //         mod_sites,
+    //         seq_len,
+    //         self.get_mod_element_count(),
+    //         self.get_mod_to_feature().clone(),
+    //         self.get_device().clone(),
+    //     )?;
+
+    //     let mut features = vec![aa_indices_tensor, mod_x];
+
+    //     // Conditionally add charge
+    //     if let Some(c) = charge {
+    //         let charge_tensor = Tensor::from_slice(
+    //             &vec![c as f64 * CHARGE_FACTOR; seq_len],
+    //             &[batch_size, seq_len, 1],
+    //             &self.get_device(),
+    //         )?
+    //         .to_dtype(DType::F32)?;
+    //         features.push(charge_tensor);
+    //     }
+
+    //     // Conditionally add NCE
+    //     if let Some(n) = nce {
+    //         let nce_tensor = Tensor::from_slice(
+    //             &vec![n as f64 * NCE_FACTOR; seq_len],
+    //             &[batch_size, seq_len, 1],
+    //             &self.get_device(),
+    //         )?
+    //         .to_dtype(DType::F32)?;
+    //         features.push(nce_tensor);
+    //     }
+
+    //     // Conditionally add instrument
+    //     if let Some(instr) = instrument {
+    //         let instrument_tensor = Tensor::from_slice(
+    //             &vec![parse_instrument_index(instr) as u32; seq_len],
+    //             &[batch_size, seq_len, 1],
+    //             &self.get_device(),
+    //         )?
+    //         .to_dtype(DType::F32)?;
+    //         features.push(instrument_tensor);
+    //     }
+
+    //     // Concatenate features
+    //     Ok(Tensor::cat(&features, 2)?)
+    // }
+
     /// Fine-tune the model on a batch of training data.
     ///
     /// # Arguments

diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
index 3d5abc9..51bfa11 100644
--- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
@@ -308,13 +308,86 @@ mod tests {
         assert_eq!(constants.nce_factor, Some(0.01));
     }
 
+    #[test]
+    fn test_encode_peptides() {
+        let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth");
+        let constants_path =
+            PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
+        let device = Device::Cpu;
+        let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device).unwrap();
+
+        let peptide_sequences = "AGHCEWQMKYR";
+        let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";
+        let mod_sites = "0;4;8";
+        // let charge = Some(2);
+        // let nce = Some(20);
+        // let instrument = Some("QE");
+
+        let result =
+            model.encode_peptide(&peptide_sequences, mods, mod_sites, None, None, None);
+
+        println!("{:?}", result);
+
+        // assert!(result.is_ok());
+        // let encoded_peptides = result.unwrap();
+        // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1));
+    }
+
+    #[test]
+    fn test_encode_peptides_batch() {
+
+        let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth");
+        let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
+        let device = Device::Cpu;
+
+        let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device.clone()).unwrap();
+
+        // Batched input
+        let peptide_sequences = vec![
+            "ACDEFGHIK".to_string(),
+            "AGHCEWQMKYR".to_string(),
+        ];
+        let mods = vec![
+            "Carbamidomethyl@C".to_string(),
+            "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(),
+        ];
+        let mod_sites = vec![
+            "1".to_string(),
+            "0;4;8".to_string(),
+        ];
+
+        println!("Peptides: {:?}", peptide_sequences);
+        println!("Mods: {:?}", mods);
+        println!("Mod sites: {:?}", mod_sites);
+
+
+        let result = model.encode_peptides(
+            &peptide_sequences,
+            &mods,
+            &mod_sites,
+            None,
+            None,
+            None,
+        );
+
+        assert!(result.is_ok());
+        let tensor = result.unwrap();
+        println!("Batched encoded tensor shape: {:?}", tensor.shape());
+
+        let (batch, seq_len, feat_dim) = tensor.shape().dims3().unwrap();
+        assert_eq!(batch, 2); // two peptides
+        assert!(seq_len >= 11); // padded to max length
+        assert!(feat_dim > 1); // includes aa + mod features
+    }
+
+
     #[test]
     fn test_prediction() {
         let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth");
         let constants_path =
             PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
         let device = /* Assuming Device is defined */ Device::new_cuda(0).unwrap_or(/* assuming Device::Cpu is defined */ Device::Cpu); // Replace with actual Device code.
-        let result = /* Assuming RTCNNLSTMModel is defined */ RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device); // Replace with actual RTCNNLSTMModel code
+        let result = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device);
         let mut model = result.unwrap();
 
         // Test prediction with a few peptides after fine-tuning

From 1086bd6e16b82cd476160740c150715edb096d95 Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 8 May 2025 15:11:31 -0400
Subject: [PATCH 09/75] chore: Update dependencies in redeem-properties crate

---
 crates/redeem-properties/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/redeem-properties/Cargo.toml b/crates/redeem-properties/Cargo.toml
index 304a370..e39b7c5 100644
--- a/crates/redeem-properties/Cargo.toml
+++ b/crates/redeem-properties/Cargo.toml
@@ -17,6 +17,7 @@ serde_yaml = "0.9"
 ndarray = "0.15"
 #ndarray = "0.16.1"
 reqwest = { version = "0.11", features = ["blocking"] }
+itertools = "0.14.0"
 zip = "2.2.2"
 csv = "1.1"
 regex = "1.6"

From c13dabd933ba271894ca63977f3b55bbf93ecd0b Mon Sep 17 00:00:00 2001
From: singjc
Date: Fri, 9 May 2025 00:40:02 -0400
Subject: [PATCH 10/75] refactor: bilstm

---
 .../src/building_blocks/bilstm.rs | 242 ++++++------------
 1 file changed, 77 insertions(+), 165 deletions(-)

diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs
index 0e04749..b6dd1aa 100644
--- a/crates/redeem-properties/src/building_blocks/bilstm.rs
+++ b/crates/redeem-properties/src/building_blocks/bilstm.rs
@@ -1,6 +1,6 @@
-use candle_core::{DType, Device, Result, Tensor};
+use candle_core::{IndexOp, Result, Tensor};
 use candle_nn::{rnn, Module, VarBuilder, RNN};
-// use crate::utils::logging::print_tensor;
+
 
 #[derive(Debug, Clone)]
 pub struct BidirectionalLSTM {
@@ -16,72 +16,26 @@ pub struct BidirectionalLSTM {
 }
 
 impl BidirectionalLSTM {
-
     pub fn new(
         input_size: usize,
         hidden_size: usize,
         num_layers: usize,
         vb: &VarBuilder,
     ) -> Result<Self> {
-
         let h0 = vb.get((num_layers * 2, 1, hidden_size), "rnn_h0")?;
         let c0 = vb.get((num_layers * 2, 1, hidden_size), "rnn_c0")?;
 
-        let lstm_config = rnn::LSTMConfig {
-            layer_idx: 0,
-            direction: rnn::Direction::Forward,
-            ..Default::default()
-        };
-
-        let lstm_config_rev = rnn::LSTMConfig {
-            layer_idx: 0,
-            direction: rnn::Direction::Backward,
-            ..Default::default()
-        };
-
-        let forward_lstm1 = rnn::lstm(
-            input_size,
-            hidden_size,
-            lstm_config.clone(),
-            vb.pp("rnn").clone()
-        )?;
-        let backward_lstm1 = rnn::lstm(
-            input_size,
-            hidden_size,
-            lstm_config_rev.clone(),
-            vb.pp("rnn").clone()
-        )?;
-
-        let lstm_config2 = rnn::LSTMConfig {
-            layer_idx: 1,
-            direction: rnn::Direction::Forward,
-            ..Default::default()
-        };
-
-        let lstm_config2_rev = rnn::LSTMConfig {
-            layer_idx: 1,
-            direction: rnn::Direction::Backward,
-            ..Default::default()
-        };
-
-        let forward_lstm2 = rnn::lstm(
-            2 * hidden_size,
-            hidden_size,
-            lstm_config2.clone(),
-            vb.pp("rnn").clone()
-        )?;
-        let backward_lstm2 = rnn::lstm(
-            2 * hidden_size,
-            hidden_size,
-            lstm_config2_rev.clone(),
-            vb.pp("rnn").clone()
-        )?;
+        let lstm1_fw = rnn::lstm(input_size, hidden_size, rnn::LSTMConfig::default(), vb.pp("rnn"))?;
+        let lstm1_bw = rnn::lstm(input_size, hidden_size, rnn::LSTMConfig { direction: rnn::Direction::Backward, ..Default::default() }, vb.pp("rnn"))?;
+
+        let lstm2_fw = rnn::lstm(2 * hidden_size, hidden_size, rnn::LSTMConfig { layer_idx: 1, ..Default::default() }, vb.pp("rnn"))?;
+        let lstm2_bw = rnn::lstm(2 * hidden_size, hidden_size, rnn::LSTMConfig { layer_idx: 1, direction: rnn::Direction::Backward, ..Default::default() }, vb.pp("rnn"))?;
 
         Ok(Self {
-            forward_lstm1,
-            backward_lstm1,
-            forward_lstm2,
-            backward_lstm2,
+            forward_lstm1: lstm1_fw,
+            backward_lstm1: lstm1_bw,
+            forward_lstm2: lstm2_fw,
+            backward_lstm2: lstm2_bw,
             h0,
             c0,
            input_size,
        })
    }
 
-    fn apply_bidirectional_layer(&self, input: &Tensor, lstm_forward: &rnn::LSTM, lstm_backward: &rnn::LSTM, h0: &Tensor, c0: &Tensor, layer_idx: &i32) -> Result<(Tensor, (Tensor, Tensor))> {
-        let (batch_size, seq_len, input_size) = input.dims3()?;
-
-        // Print first and last 5 values of the original input
-        let input_vec = input.to_vec3::<f32>()?;
-
-        // Forward pass
-        let h0_forward = h0.narrow(0, 0, 1)?.reshape((batch_size, h0.dim(2)?))?;
-        let c0_forward = c0.narrow(0, 0, 1)?.reshape((batch_size, c0.dim(2)?))?;
-
-        let state_forward = rnn::LSTMState{ h: h0_forward.clone(), c: c0_forward.clone() };
-
-        let output_forward_states: Vec<rnn::LSTMState> = lstm_forward.seq_init(&input, &state_forward)?;
-        let output_forward = Tensor::stack(&output_forward_states.iter().map(|state| state.h().clone()).collect::<Vec<_>>(), 1)?;
-        let last_forward_state = output_forward_states.last().unwrap().h().clone();
-
-        // Backward pass
-        let h0_backward = h0.narrow(0, 1, 1)?.reshape((batch_size, h0.dim(2)?))?;
-        let c0_backward = c0.narrow(0, 1, 1)?.reshape((batch_size, c0.dim(2)?))?;
-
-        let state_backward = rnn::LSTMState{ h: h0_backward.clone(), c: c0_backward.clone() };
-
-        // Correctly reverse the input sequence
-        let mut reversed_input = vec![vec![vec![0.0; input_size]; seq_len]; batch_size];
-        for b in 0..batch_size {
-            for t in 0..seq_len {
-                for i in 0..input_size {
-                    reversed_input[b][seq_len - t - 1][i] = input_vec[b][t][i];
-                }
-            }
-        }
-        let input_reversed = Tensor::new(reversed_input, input.device())?
-            .to_dtype(DType::F32)?
-            .reshape((batch_size, seq_len, input_size))?;
-
-        // Print first and last 5 values of the reversed input
-        let reversed_input_vec = input_reversed.to_vec3::<f32>()?;
-
-        let output_backward_states = lstm_backward.seq_init(&input_reversed, &state_backward)?;
-        let output_backward = Tensor::stack(&output_backward_states.iter().map(|state| state.h().clone()).collect::<Vec<_>>(), 1)?;
-
-        // Use the last state of the backward LSTM (which corresponds to the first element of the original sequence)
-        let last_backward_state = output_backward_states.last().unwrap().h().clone();
-
-        // Combine the forward and backward hidden states for hn
-        let hn = Tensor::cat(&[last_forward_state.unsqueeze(0)?, last_backward_state.unsqueeze(0)?], 0)?; // Shape: [2, 1, 128]
-        let hn_concat = Tensor::cat(&[last_forward_state, last_backward_state], 1)?; // Shape: [1, 256]
-
-        // Combine the forward and backwards cell states for cn
-        let cn = Tensor::cat(&[output_forward_states.last().unwrap().c().clone(), output_backward_states.last().unwrap().c().clone()], 0)?; // Shape: [2, 1, 128]
-
-        // The output_backward is already in the correct order for the original sequence
-        let output = Tensor::cat(&[output_forward, output_backward], 2)?; // Shape: [1, 13, 256]
+    fn apply_bidirectional_layer(
+        &self,
+        input: &Tensor,
+        lstm_forward: &rnn::LSTM,
+        lstm_backward: &rnn::LSTM,
+        h0: &Tensor,
+        c0: &Tensor,
+    ) -> Result<(Tensor, (Tensor, Tensor))> {
+        let (_batch_size, seq_len, _input_size) = input.dims3()?;
+
+        // Initial states for forward
+        let h0_forward = h0.i(0)?;
+        let c0_forward = c0.i(0)?;
+        let state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward };
+
+        let start_time = std::time::Instant::now();
+        let out_fw_states = lstm_forward.seq_init(input, &state_fw)?;
+        let out_fw = Tensor::stack(
+            &out_fw_states.iter().map(|s| s.h()).collect::<Vec<_>>(),
+            1,
+        )?;
+        let last_fw_h = out_fw_states.last().unwrap().h().clone();
+        let last_fw_c = out_fw_states.last().unwrap().c().clone();
+        println!("BidirectionLSTM::apply_bidirectional_layer - Forward LSTM time: {:?}", start_time.elapsed());
+
+        // Reverse sequence
+        let start_time = std::time::Instant::now();
+        let input_reversed = Tensor::cat(
+            &(0..seq_len)
+                .rev()
+                .map(|t| input.i((.., t..=t, ..)))
+                .collect::<Result<Vec<_>>>()?,
+            1,
+        )?;
+        println!("BidirectionLSTM::apply_bidirectional_layer - Reverse sequence time: {:?}", start_time.elapsed());
+
+        // Initial states for backward
+        let h0_backward = h0.i(1)?;
+        let c0_backward = c0.i(1)?;
+        let state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward };
+
+        let start_time = std::time::Instant::now();
+        let out_bw_states = lstm_backward.seq_init(&input_reversed, &state_bw)?;
+        let out_bw = Tensor::stack(
+            &out_bw_states.iter().map(|s| s.h()).collect::<Vec<_>>(),
+            1,
+        )?;
+        let last_bw_h = out_bw_states.last().unwrap().h().clone();
+        let last_bw_c = out_bw_states.last().unwrap().c().clone();
+        println!("BidirectionLSTM::apply_bidirectional_layer - Backward LSTM time: {:?}", start_time.elapsed());
+
+        // Combine hidden and cell states
+        let hn = Tensor::stack(&[last_fw_h.clone(), last_bw_h.clone()], 0)?;
+        let cn = Tensor::stack(&[last_fw_c, last_bw_c], 0)?;
+        let output = Tensor::cat(&[out_fw, out_bw], 2)?;
 
         Ok((output, (hn, cn)))
     }
 
-    // New method that returns output and states
-    pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> {
-        let (batch_size, seq_len, input_size) = xs.dims3()?;
-
-        let h0 = &self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?;
-        let c0 = &self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?;
+    /// Forward with hidden states returned
+    pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> {
+        let (batch_size, _, _) = xs.dims3()?;
+        let h0 = self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?;
+        let c0 = self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?;
 
         let h0_1 = h0.narrow(0, 0, 2)?;
-        let h0_2 = h0.narrow(0, 2, 2)?;
         let c0_1 = c0.narrow(0, 0, 2)?;
+        let h0_2 = h0.narrow(0, 2, 2)?;
         let c0_2 = c0.narrow(0, 2, 2)?;
 
-        let (layer1_output, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1, &1)?;
-        let (layer2_output, (hn2, cn2)) = self.apply_bidirectional_layer(&layer1_output, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2, &2)?;
+        let start_time = std::time::Instant::now();
+        let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?;
+        println!("BidirectionLSTM::forward_with_state - Layer 1 time: {:?}", start_time.elapsed());
+        let start_time = std::time::Instant::now();
+        let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?;
+        println!("BidirectionLSTM::forward_with_state - Layer 2 time: {:?}", start_time.elapsed());
 
-        let final_hn = Tensor::cat(&[hn1, hn2], 0)?;
-        let final_cn = Tensor::cat(&[cn1, cn2], 0)?;
-
-        Ok((layer2_output, (final_hn, final_cn)))
-    }
-
-
-    /// Print the weights of the BiLSTM
-    pub fn print_weights(&self, vb: &VarBuilder) -> Result<()> {
-        fn print_first_few(tensor: &Tensor, name: &str) -> Result<()> {
-            let flattened = tensor.flatten_all()?;
-            let num_elements = flattened.dim(0)?;
-            let num_to_print = 5.min(num_elements);
-            println!("{} shape: {:?}", name, tensor.shape());
-            println!("{} (first few values): {:?}", name, flattened.narrow(0, 0, num_to_print)?.to_vec1::<f32>()?);
-            Ok(())
-        }
-
-        fn print_lstm_weights(vb: &VarBuilder, layer: usize, direction: &str) -> Result<()> {
-            let prefix = format!("rt_encoder.hidden_nn.rnn.weight_");
-            let ih_name = format!("{}ih_l{}{}", prefix, layer, direction);
-            let hh_name = format!("{}hh_l{}{}", prefix, layer, direction);
-
-            // println!("LSTM layer {} {} weights:", layer, direction);
-            if layer == 1 {
-                print_first_few(&vb.get((512, 256), &ih_name)?, &format!("  {}", ih_name))?;
-            } else {
-                print_first_few(&vb.get((512, 140), &ih_name)?, &format!("  {}", ih_name))?;
-            }
-
-            print_first_few(&vb.get((512, 128), &hh_name)?, &format!("  {}", hh_name))?;
-
-            Ok(())
-        }
-
-        // Print forward LSTM weights
-        print_lstm_weights(vb, 0, "")?;
-        print_lstm_weights(vb, 1, "")?;
-
-        // Print backward LSTM weights
-        print_lstm_weights(vb, 0, "_reverse")?;
-        print_lstm_weights(vb, 1, "_reverse")?;
-
-        Ok(())
+        let hn = Tensor::cat(&[hn1, hn2], 0)?;
+        let cn = Tensor::cat(&[cn1, cn2], 0)?;
+        Ok((out2, (hn, cn)))
     }
 
     pub fn input_size(&self) -> usize {
@@ -222,21 +139,16 @@ impl BidirectionalLSTM {
     pub fn num_layers(&self) -> usize {
         self.num_layers
     }
-
 }
 
-
 impl Module for BidirectionalLSTM {
-
-    /// Forward pass of the BiLSTM
     fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        // This method now only returns the output tensor
         let (output, _) = self.forward_with_state(xs)?;
         Ok(output)
    }
-
}
 
+
 #[cfg(test)]
 mod test {
     use super::*;
anyhow::{Result, anyhow}; -use std::{collections::HashMap, ops::Deref}; -use ndarray::Array2; +use std::collections::HashMap; use candle_core::{DType, Device, Tensor}; +use rayon::prelude::*; +use std::sync::atomic::{AtomicU32, Ordering}; use crate::building_blocks::building_blocks::AA_EMBEDDING_SIZE; -/// Convert peptide sequences into AA ID array. -/// -/// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L88 -/// -/// Example: -/// ```rust -/// use redeem_properties::building_blocks::featurize::get_aa_indices; -/// use anyhow::Result; -/// use ndarray::Array2; -/// -/// let seq = "AGHCEWQMKYR"; -/// let result = get_aa_indices(seq).unwrap(); -/// println!("aa_indices: {:?}", result); -/// let expect_out = Array2::from_shape_vec((1, 13), vec![0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]).unwrap(); -/// assert_eq!(result.shape(), &[1, 13]); -/// assert_eq!(result, expect_out); -/// ``` -pub fn get_aa_indices(seq: &str) -> Result> { - let valid_aa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; // amino acids as defined in alphabase: https://github.com/MannLabs/alphabase/blob/main/alphabase/constants/const_files/amino_acid.tsv - let filtered_seq: String = seq.chars().filter(|c| valid_aa.contains(*c)).collect(); - - // TODO: Maybe this should be done higher up in the pipeline, and this should panic here instead. - // But for now this is done to deal with cases like: -MQPLSKL - if seq.len() != filtered_seq.len() { - log::trace!("Invalid amino acid characters found in sequence: {:?}, stripping them out to {:?}", seq, filtered_seq); - } - - let seq_len = filtered_seq.len(); - let mut result = Array2::::zeros((1, seq_len + 2)); - - for (j, c) in filtered_seq.chars().enumerate() { - let aa_index = (c as i64) - ('A' as i64) + 1; - result[[0, j + 1]] = aa_index; - } - - Ok(result) -} - -/// Convert peptide sequences into ASCII code array. -/// -/// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L115 -pub fn get_ascii_indices(peptide_sequences: &[String], device: Device) -> Result { - // println!("Peptide sequences to encode: {:?}", peptide_sequences); - let max_len = peptide_sequences.iter().map(|s| s.len()).max().unwrap_or(0) + 2; // +2 for padding - let batch_size = peptide_sequences.len(); - - let mut aa_indices = vec![0u32; batch_size * max_len]; - - for (i, peptide) in peptide_sequences.iter().enumerate() { - for (j, c) in peptide.chars().enumerate() { - aa_indices[i * max_len + j + 1] = c as u32; // +1 to skip the first padding - } - } - let aa_indices_tensor = - Tensor::from_slice(&aa_indices, (batch_size, max_len), &device)?; - Ok(aa_indices_tensor) -} - -/// One-hot encode amino acid indices and concatenate additional tensors. 
-pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result<Tensor> {
-    let (batch_size, seq_len) = aa_indices.shape().dims2()?;
-    let num_classes = AA_EMBEDDING_SIZE;
-
-    let mut one_hot_data = vec![0.0f32; batch_size * seq_len * num_classes];
-
-    // Iterate over the 2D tensor directly
-    for batch_idx in 0..batch_size {
-        for seq_idx in 0..seq_len {
-            let index = aa_indices.get(batch_idx)?.get(seq_idx)?.to_scalar::<f32>()?;
-            let class_idx = index.round() as usize; // Round to nearest integer and convert to usize
-            if class_idx < num_classes {
-                one_hot_data[batch_idx * seq_len * num_classes + seq_idx * num_classes + class_idx] = 1.0;
-            }
-        }
-    }
-
-    // Convert the one_hot_data vector directly to a tensor
-    let one_hot_tensor = Tensor::from_slice(&one_hot_data, (batch_size, seq_len, num_classes), aa_indices.device())
-        .map_err(|e| anyhow!("{}", e))?;
-
-    // Concatenate additional tensors if provided
-    let mut output_tensor = one_hot_tensor;
-
-    for other in cat_others {
-        output_tensor = Tensor::cat(&[output_tensor, other.deref().clone()], 2)?;
-    }
-
-    Ok(output_tensor)
-}
-
-
-/// Get the modification features for a given set of modifications and modification sites.
-///
-/// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L47
-pub fn get_mod_features(mods: &str, mod_sites: &str, seq_len: usize, mod_feature_size: usize, mod_to_feature: HashMap<String, Vec<f32>>, device: Device) -> Result<Tensor> {
-    let mod_names: Vec<&str> = mods.split(';').filter(|&s| !s.is_empty()).collect();
-    let mod_sites: Vec<usize> = mod_sites
-        .split(';')
-        .filter(|&s| !s.is_empty())
-        .map(|s| s.parse::<usize>().unwrap())
-        .collect();
-
-    // let mod_feature_size = self.constants.mod_elements.len();
-
-    let mut mod_x = vec![0.0f32; seq_len * mod_feature_size];
-
-    for (mod_name, &site) in mod_names.iter().zip(mod_sites.iter()) {
-        if let Some(feat) = mod_to_feature.get(*mod_name) {
-            for (i, &value) in feat.iter().enumerate() {
-                if site < seq_len {
-                    mod_x[site * mod_feature_size + i] += value;
-                }
-            }
-            // println!("Site: {}, feat: {:?}", site, feat);
-        }
-    }
-
-    Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), &device)
-        .map_err(|e| anyhow!("Failed to create tensor: {}", e))
-}
-
 
 const VALID_AA: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
 
@@ -138,7 +18,10 @@ fn aa_index_map() -> HashMap {
         .collect()
 }
 
-/// Efficiently converts an amino acid sequence to a padded tensor of indices
+
+/// Convert peptide sequences into AA ID array.
+///
+/// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L88
 pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result<Tensor> {
     let map = aa_index_map();
     let filtered: Vec = seq
@@ -153,7 +36,48 @@ pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result<Tensor> {
 }
 
-/// Optimized version of get_mod_features that avoids repeated parsing
+/// One-hot encode amino acid indices and concatenate additional tensors.
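+///
+/// The output shape is `[batch, seq_len, AA_EMBEDDING_SIZE + extra]`, where `extra` is the
+/// summed trailing feature dimension of `cat_others`. A hypothetical call (variable names
+/// illustrative only, not part of this crate):
+///
+/// ```ignore
+/// // 27 one-hot AA channels concatenated with an 8-dim mod embedding -> 35 features
+/// let x = aa_one_hot(&aa_indices, &[&mod_x])?;
+/// ```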
+pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result<Tensor> {
+    let (batch_size, seq_len) = aa_indices.shape().dims2()?;
+    let num_classes = AA_EMBEDDING_SIZE;
+
+    // Extract all indices as f32s once
+    let indices = aa_indices.to_vec2::<f32>()?;
+
+    // Preallocate output buffer
+    let mut one_hot_data = vec![0.0f32; batch_size * seq_len * num_classes];
+
+    // Use parallel iterator for speed
+    one_hot_data
+        .par_chunks_mut(seq_len * num_classes)
+        .zip(indices.par_iter())
+        .for_each(|(chunk, row)| {
+            for (seq_idx, &fidx) in row.iter().enumerate() {
+                let class_idx = fidx.round() as usize;
+                if class_idx < num_classes {
+                    chunk[seq_idx * num_classes + class_idx] = 1.0;
+                }
+            }
+        });
+
+    let one_hot_tensor = Tensor::from_slice(&one_hot_data, (batch_size, seq_len, num_classes), aa_indices.device())
+        .map_err(|e| anyhow!("Failed to create one-hot tensor: {}", e))?;
+
+    // Concatenate with additional tensors
+    if cat_others.is_empty() {
+        Ok(one_hot_tensor)
+    } else {
+        let mut features = vec![one_hot_tensor];
+        features.extend(cat_others.iter().cloned().cloned());
+        Ok(Tensor::cat(&features, 2)?)
+    }
+}
+
+
+
+/// Get the modification features for a given set of modifications and modification sites.
+///
+/// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L47
 pub fn get_mod_features_from_parsed(
     mod_names: &[&str],
     mod_sites: &[usize],
@@ -162,24 +86,41 @@ pub fn get_mod_features_from_parsed(
     mod_to_feature: &HashMap<String, Vec<f32>>,
     device: &Device,
 ) -> Result<Tensor> {
-    let mut mod_x = vec![0.0f32; seq_len * mod_feature_size];
+    // Initialize buffer with atomic wrappers
+    let atomic_buffer: Vec<AtomicU32> = (0..seq_len * mod_feature_size)
+        .map(|_| AtomicU32::new(0))
+        .collect();
 
-    for (mod_name, &site) in mod_names.iter().zip(mod_sites.iter()) {
-        if site >= seq_len {
-            log::warn!("Skipping mod {} at invalid site {} (seq_len {})", mod_name, site, seq_len);
-            continue;
-        }
-        if let Some(feat) = mod_to_feature.get(*mod_name) {
-            for (i, &val) in feat.iter().enumerate() {
-                mod_x[site * mod_feature_size + i] += val;
+    mod_names
+        .par_iter()
+        .zip(mod_sites.par_iter())
+        .for_each(|(&mod_name, &site)| {
+            if site >= seq_len {
+                log::warn!(
+                    "Skipping mod {} at invalid site {} (seq_len {})",
+                    mod_name, site, seq_len
+                );
+                return;
             }
-        } else {
-            log::warn!("Unknown modification feature: {}", mod_name);
-        }
-    }
+            if let Some(feat) = mod_to_feature.get(mod_name) {
+                for (i, &val) in feat.iter().enumerate() {
+                    let idx = site * mod_feature_size + i;
+                    // Accumulate with a compare-and-swap loop so the addition happens in
+                    // f32 space; an integer fetch_add on raw float bit patterns would
+                    // corrupt the sum whenever two modifications touch the same index.
+                    let slot = &atomic_buffer[idx];
+                    let mut current = slot.load(Ordering::Relaxed);
+                    loop {
+                        let new_bits = (f32::from_bits(current) + val).to_bits();
+                        match slot.compare_exchange_weak(current, new_bits, Ordering::Relaxed, Ordering::Relaxed) {
+                            Ok(_) => break,
+                            Err(actual) => current = actual,
+                        }
+                    }
+                }
+            } else {
+                log::warn!("Unknown modification feature: {}", mod_name);
+            }
+        });
 
-    Ok(Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), device)
-        .map_err(|e| anyhow!("Failed to create tensor: {}", e))?)
+    // Convert atomic buffer back to f32
+    let mod_x: Vec<f32> = atomic_buffer
+        .into_iter()
+        .map(|a| f32::from_bits(a.load(Ordering::Relaxed)))
+        .collect();
+
+    Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), device)
+        .map_err(|e| anyhow!("Failed to create tensor: {}", e))
 }
 
@@ -193,21 +134,9 @@ mod tests {
     use super::*;
     use candle_core::Device;
     use candle_core::Tensor;
-    use ndarray::Array2;
     use std::collections::HashMap;
     use std::path::PathBuf;
 
-    #[test]
-    fn test_get_aa_indices() {
-        let seq = "AGHCEWQMKYR";
-        let result = get_aa_indices(seq).unwrap();
-        // expected result is [[0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]]
-        let expect_out = Array2::from_shape_vec((1, 13), vec![0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]).unwrap();
-        println!("{:?} - aa_indices: {:?}", seq, result);
-        assert_eq!(result.shape(), &[1, 13]);
-        assert_eq!(result, expect_out);
-    }
-
     #[test]
     fn test_aa_indices_tensor(){
         let device = Device::Cpu;
@@ -221,32 +150,6 @@ mod tests {
         // assert_eq!(result.to_vec3::<f32>().unwrap(), expect_out.to_vec3::<f32>().unwrap());
     }
 
-    #[test]
-    fn test_get_mod_features() {
-        let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";
-        let mod_sites = "0;4;8";
-        let seq_len = 11 + 2;
-        let mod_feature_size = 109;
-
-        let constants_path =
-            PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
-        let constants: ModelConstants =
-            parse_model_constants(constants_path.to_str().unwrap()).unwrap();
-        let mod_to_feature: HashMap<String, Vec<f32>> = load_mod_to_feature(&constants).unwrap();
-
-        let device = Device::Cpu;
-        let tensor = get_mod_features(
-            mods,
-            mod_sites,
-            seq_len,
-            mod_feature_size,
-            mod_to_feature,
-            device,
-        ).unwrap();
-        println!("tensor shape: {:?}", tensor.shape());
-        assert_eq!(tensor.shape().dims(), &[1, seq_len, mod_feature_size]);
-    }
-
     #[test]
     fn test_get_mod_features_from_parsed() {
         let mods_str = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";

From f3a50131b035ca8e38f66fc2cbc4709176c345e4 Mon Sep 17 00:00:00 2001
From: singjc
Date: Fri, 9 May 2025 00:46:52 -0400
Subject: [PATCH 12/75] refactor: Update RTCNNLSTMModel forward method to
 improve performance and readability

---
 .../src/models/rt_cnn_lstm_model.rs           | 40 ++++++++-----------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
index 51bfa11..89a6cec 100644
--- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
@@ -1,25 +1,20 @@
-use anyhow::{anyhow, Result};
-use candle_core::{DType, Device, IndexOp, Tensor, Var, D};
-use candle_nn::{ops, Dropout, Module, Optimizer, VarBuilder, VarMap};
-use ndarray::Array2;
-use serde::Deserialize;
+use anyhow::Result;
+use candle_core::{DType, Device, IndexOp, Tensor};
+use candle_nn::{Dropout, Module, VarBuilder, VarMap};
 use std::collections::HashMap;
 use std::path::Path;
-use log::info;
 
-// use crate::models::rt_model::RTModel;
-use crate::building_blocks::bilstm::BidirectionalLSTM;
+
+
 use crate::building_blocks::building_blocks::{
-    DecoderLinear, Encoder26aaModCnnLstmAttnSum, AA_EMBEDDING_SIZE, MOD_FEATURE_SIZE,
+    DecoderLinear, Encoder26aaModCnnLstmAttnSum, MOD_FEATURE_SIZE,
 };
-use crate::building_blocks::featurize::{aa_one_hot, get_aa_indices, get_mod_features};
-use crate::models::model_interface::{ModelInterface, PropertyType, PredictionResult, load_tensors_from_model, create_var_map};
-use crate::utils::data_handling::PeptideData;
+use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map}; use crate::utils::peptdeep_utils::{ - extract_masses_and_indices, get_modification_indices, load_mod_to_feature, load_modifications, - parse_model_constants, remove_mass_shift, ModelConstants, ModificationMap, + load_mod_to_feature, + parse_model_constants, ModelConstants, }; -use crate::utils::logging::Progress; + // Main Model Struct @@ -115,17 +110,16 @@ impl ModelInterface for RTCNNLSTMModel { fn forward(&self, xs: &Tensor) -> Result { - let (batch_size, seq_len, _) = xs.shape().dims3()?; - - let start_mod_x = 1; + let (_batch_size, _seq_len, _) = xs.shape().dims3()?; + let aa_indices_out = xs.i((.., .., 0))?; - let mod_x_out = xs.i((.., .., start_mod_x..start_mod_x + MOD_FEATURE_SIZE))?; - + let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; let x = self.dropout.forward(&x, self.is_training)?; let x = self.rt_decoder.forward(&x)?; + let result = x.squeeze(1)?; - Ok(x.squeeze(1)?) + Ok(result) } /// Set model to evaluation mode for inference @@ -279,11 +273,8 @@ impl ModelInterface for RTCNNLSTMModel { mod tests { use crate::models::model_interface::ModelInterface; use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; - use crate::utils::peptdeep_utils::load_modifications; use candle_core::Device; use std::path::PathBuf; - use std::time::Instant; - // use itertools::izip; use super::*; @@ -392,6 +383,7 @@ mod tests { // Test prediction with a few peptides after fine-tuning let test_peptides = vec![ + ("AGHCEWQMKYR", "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", "0;4;8", 0.2945), ("QPYAVSELAGHQTSAESWGTGR", "", "", 0.4328955), ("GMSVSDLADKLSTDDLNSLIAHAHR", "Oxidation@M", "1", 0.6536107), ( From d1aea7907561a3b468a864db0e14bedcfe5b9d9f Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 00:53:45 -0400 Subject: [PATCH 13/75] refactor: Update redeem-properties crate models to remove unused imports and improve code organization --- .../src/models/ccs_cnn_lstm_model.rs | 27 +- .../redeem-properties/src/models/ccs_model.rs | 2 +- .../src/models/model_interface.rs | 231 ++---------------- .../src/models/ms2_bert_model.rs | 109 +++++---- 4 files changed, 102 insertions(+), 267 deletions(-) diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs index bde8a86..4a06304 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs @@ -1,27 +1,20 @@ -use anyhow::{anyhow, Result}; -use candle_core::{DType, Device, IndexOp, Tensor, Var, D}; +use anyhow::Result; +use candle_core::{DType, Device, IndexOp, Tensor}; use candle_nn::{ - ops, Dropout, Module, Optimizer, VarBuilder, VarMap, + Dropout, Module, VarBuilder, VarMap, }; -use log::info; -use ndarray::Array2; -use serde::Deserialize; + use std::collections::HashMap; -use std::process::Output; -use std::{char, fmt, vec}; +use std::{fmt, vec}; use std::path::Path; use crate::building_blocks::building_blocks::{ - DecoderLinear, Encoder26aaModChargeCnnLstmAttnSum, AA_EMBEDDING_SIZE, MOD_FEATURE_SIZE, + DecoderLinear, Encoder26aaModChargeCnnLstmAttnSum, MOD_FEATURE_SIZE, }; -use crate::building_blocks::featurize::{aa_one_hot, get_aa_indices, get_mod_features}; -use crate::utils::logging::Progress; -use crate::utils::data_handling::PeptideData; -use crate::utils::peptdeep_utils::{extract_masses_and_indices, 
get_modification_indices, remove_mass_shift};
 use crate::{
-    models::model_interface::{ModelInterface, PropertyType, PredictionResult,load_tensors_from_model, create_var_map},
+    models::model_interface::{ModelInterface, PropertyType,load_tensors_from_model, create_var_map},
     utils::peptdeep_utils::{
-        load_mod_to_feature, parse_instrument_index, parse_model_constants, ModelConstants,
+        load_mod_to_feature, parse_model_constants, ModelConstants,
     },
 };
@@ -144,7 +137,7 @@ impl ModelInterface for CCSCNNLSTMModel {
     fn forward(&self, xs: &Tensor) -> Result<Tensor> {
-        let (batch_size, seq_len, _) = xs.shape().dims3()?;
+        let (_batch_size, _seq_len, _) = xs.shape().dims3()?;
 
         // Separate input into aa_indices, mod_x, charge
         let start_mod_x = 1;
@@ -288,8 +281,6 @@ mod tests {
     use super::*;
     use crate::models::model_interface::ModelInterface;
     use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
-    use crate::utils::peptdeep_utils::load_modifications;
-    use crate::utils::data_handling::PeptideData;
     use candle_core::Device;
     use std::path::PathBuf;
 
diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 6adbdbb..4921bb4 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -1,5 +1,5 @@
 use std::path::Path;
-use candle_core::{Device, Tensor};
+use candle_core::Device;
 use anyhow::{Result, anyhow};
 use crate::models::model_interface::{ModelInterface,PredictionResult};
 use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 73070b5..b0e7655 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -1,25 +1,24 @@
 use crate::{
-    building_blocks::featurize::{self, aa_indices_tensor, get_aa_indices, get_mod_features, get_mod_features_from_parsed},
+    building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed},
     models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper},
     utils::{
         data_handling::PeptideData,
         logging::Progress,
         peptdeep_utils::{
             get_modification_indices, get_modification_string, parse_instrument_index,
-            remove_mass_shift, ModificationMap,
+            remove_mass_shift,
         }
     },
 };
 use anyhow::{Context, Result};
 use candle_core::{DType, Device, Tensor, Var};
-use candle_nn::{Module, Optimizer, VarMap};
+use candle_nn::{Optimizer, VarMap};
 use log::info;
 use rayon::prelude::*;
-use std::ops::{Index, IndexMut};
+use std::ops::Index;
 use std::path::Path;
-use std::sync::{Arc, Mutex};
 use std::{collections::HashMap, path::PathBuf};
-use itertools::izip;
 
 // Constants
 const CHARGE_FACTOR: f64 = 0.1;
@@ -287,9 +286,9 @@
     ) -> Result<Tensor> {
         let device = self.get_device();
         let mod_feature_size = self.get_mod_element_count();
-        let mod_to_feature = self.get_mod_to_feature().clone();
+        let mod_to_feature = self.get_mod_to_feature();
 
-        let aa_tensor = aa_indices_tensor(peptide_sequence, &device)?;
+        let aa_tensor = aa_indices_tensor(peptide_sequence, device)?;
         let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?;
 
         let mod_names: Vec<&str> = mods.split(';').filter(|s| !s.is_empty()).collect();
@@ -304,8 +303,8 @@
             &mod_indices,
             seq_len,
             mod_feature_size,
-            &mod_to_feature,
-            &device,
+            mod_to_feature,
+            device,
         )?;
 
         let mut features =
vec![aa_tensor, mod_tensor];
@@ -314,7 +313,7 @@
             let charge_tensor = Tensor::from_slice(
                 &vec![c as f64 * CHARGE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
-                &device,
+                device,
             )?.to_dtype(DType::F32)?;
             features.push(charge_tensor);
         }
@@ -323,7 +322,7 @@
             let nce_tensor = Tensor::from_slice(
                 &vec![n as f64 * NCE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
-                &device,
+                device,
             )?.to_dtype(DType::F32)?;
             features.push(nce_tensor);
         }
@@ -333,12 +332,16 @@
             let instr_tensor = Tensor::from_slice(
                 &vec![instr_idx; seq_len],
                 &[batch_size, seq_len, 1],
-                &device,
+                device,
             )?.to_dtype(DType::F32)?;
             features.push(instr_tensor);
         }
 
-        Ok(Tensor::cat(&features, 2)?)
+        if features.len() == 1 {
+            Ok(features.remove(0))
+        } else {
+            Ok(Tensor::cat(&features, 2)?)
+        }
     }
 
     /// Encode a batch of peptide sequences into a tensor
@@ -352,7 +355,7 @@
         instruments: Option<Vec<String>>,
     ) -> Result<Tensor> {
         let len = peptide_sequences.len();
-
+        
         let tensors: Vec<_> = (0..len)
             .into_par_iter()
             .map(|i| {
@@ -365,14 +368,18 @@
                     instruments.as_ref().map(|v| v[i].as_str()),
                 )
             })
-            .collect::<Result<Vec<_>, _>>()?; // Propagate errors
-
+            .collect::<Result<Vec<Tensor>>>()?;
+
+        if tensors.is_empty() {
+            return Err(anyhow::anyhow!("Encoding batch of peptides failed, the resulting tensor batch is empty."));
+        }
+
         let max_len = tensors
             .iter()
             .map(|t| t.shape().dims3().unwrap().1)
             .max()
             .unwrap_or(0);
-
+        
         let padded = tensors
             .into_par_iter()
             .map(|t| {
@@ -386,195 +393,9 @@
             })
             .map(|res| res.map_err(anyhow::Error::from))
             .collect::<Result<Vec<_>, _>>()?;
-
+        
         Ok(Tensor::cat(&padded, 0)?)
     }
-
-
-    // /// Encode a batch of peptide sequences (plus modifications) into a tensor.
-    // ///
-    // /// # Arguments
-    // /// * `peptide_sequences` - A vector of peptide sequences.
-    // /// * `mods` - A vector of strings representing the modifications for each peptide.
-    // /// * `mod_sites` - A vector of strings representing the modification site indices for each peptide.
-    // /// * `charge` - An optional vector of charge states for each peptide.
-    // /// * `nce` - An optional vector of nominal collision energies for each peptide.
-    // /// * `instruments` - An optional vector of instrument names for each peptide.
-    // ///
-    // /// # Returns
-    // /// A tensor containing the encoded peptide sequences.
-    // fn encode_peptides(
-    //     &self,
-    //     peptide_sequences: &[String],
-    //     mods: &[String],
-    //     mod_sites: &[String],
-    //     charges: Option<Vec<i32>>,
-    //     nces: Option<Vec<i32>>,
-    //     instruments: Option<Vec<String>>,
-    // ) -> Result<Tensor> {
-    //     if peptide_sequences.len() != mods.len() || peptide_sequences.len() != mod_sites.len() {
-    //         return Err(anyhow::anyhow!(
-    //             "Mismatch in input lengths: peptide_sequences, mods, and mod_sites must have the same length."
- // )); - // } - - // // Encode peptides in parallel using Rayon - // let encoded_tensors: Vec = peptide_sequences - // .par_iter() // Use Rayon's parallel iterator - // .enumerate() - // .map(|(i, peptide)| { - // self.encode_peptide( - // peptide, - // &mods[i], - // &mod_sites[i], - // charges.as_ref().map(|c| c[i]), - // nces.as_ref().map(|n| n[i]), - // instruments.as_ref().map(|ins| ins[i].as_str()), - // ) - // }) - // .collect::>>()?; // Collect results and propagate errors if any - - // // Determine the maximum sequence length - // let max_seq_len = encoded_tensors - // .par_iter() - // .map(|t| t.shape().dims3().unwrap().1) // Get sequence length (dimension 1) - // .max() - // .unwrap_or(0); - - // // Pad tensors to the max_seq_len - // let padded_tensors: Result> = encoded_tensors - // .into_par_iter() // Use Rayon's parallel iterator - // .map(|t| { - // let (_, seq_len, feature_size) = t.shape().dims3()?; // Extract feature dimension - // if seq_len < max_seq_len { - // let pad_size = max_seq_len - seq_len; - // // Create a padding tensor with the correct shape and type - // let pad = Tensor::zeros( - // &[1, pad_size, feature_size], // Use the correct feature dimension - // t.dtype(), - // t.device(), - // )?; - // // Concatenate padding along sequence length - // Tensor::cat(&[&t, &pad], 1) - // } else { - // Ok(t) - // } - // }) - // .collect::, _>>() - // .map_err(Into::into); - - // let padded_tensors = padded_tensors?; - - // // Concatenate all padded tensors along the batch dimension - // let batch_tensor = Tensor::cat(&padded_tensors, 0)?; - - // Ok(batch_tensor) - // } - - // /// Encode peptide sequence (plus modifications) into a tensor. - // /// - // /// # Arguments - // /// * `peptide_sequence` - The peptide sequence. - // /// * `mods` - A string representing the modifications for the peptide. - // /// * `mod_sites` - A string representing the modification site indices for the peptide. - // /// * `charge` - An optional charge state for the peptide. - // /// * `nce` - An optional nominal collision energy for the peptide. - // /// * `instrument` - An optional instrument name for the peptide. - // /// - // /// # Returns - // /// A tensor containing the encoded peptide sequence. - // fn encode_peptide( - // &self, - // peptide_sequence: &str, - // mods: &str, - // mod_sites: &str, - // charge: Option, - // nce: Option, - // instrument: Option<&str>, - // ) -> Result { - // log::trace!( - // "[ModelInterface::encode_peptide] Encoding peptide: {:?}, mods: {:?}, mod_sites: {:?}, charge: {:?}, nce: {:?}, instrument: {:?}", - // peptide_sequence, - // mods, - // mod_sites, - // charge, - // nce, - // instrument - // ); - // let aa_indices = get_aa_indices(peptide_sequence)?; - // log::trace!( - // "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}", - // aa_indices.shape(), - // aa_indices.iter().min(), - // aa_indices.iter().max() - // ); - - // // Convert ndarray to Tensor (F32) - // let aa_indices_tensor = Tensor::from_slice( - // &aa_indices.as_slice().unwrap(), - // (aa_indices.shape()[0], aa_indices.shape()[1]), - // &self.get_device(), - // )? 
- // .to_dtype(DType::F32)?; - - // let (batch_size, seq_len) = aa_indices_tensor.shape().dims2()?; - // let aa_indices_tensor = aa_indices_tensor.unsqueeze(2)?; // Shape: batch_size x seq_len x 1 - - // log::trace!( - // "[ModelInterface::encode_peptide] aa_indices_tensor shape: {:?}, min: {:?}, max: {:?}", - // aa_indices_tensor.shape(), - // aa_indices_tensor.min_all(), - // aa_indices_tensor.max_all() - // ); - - // // Get modification features - // let mod_x = get_mod_features( - // mods, - // mod_sites, - // seq_len, - // self.get_mod_element_count(), - // self.get_mod_to_feature().clone(), - // self.get_device().clone(), - // )?; - - // let mut features = vec![aa_indices_tensor, mod_x]; - - // // Conditionally add charge - // if let Some(c) = charge { - // let charge_tensor = Tensor::from_slice( - // &vec![c as f64 * CHARGE_FACTOR; seq_len], - // &[batch_size, seq_len, 1], - // &self.get_device(), - // )? - // .to_dtype(DType::F32)?; - // features.push(charge_tensor); - // } - - // // Conditionally add NCE - // if let Some(n) = nce { - // let nce_tensor = Tensor::from_slice( - // &vec![n as f64 * NCE_FACTOR; seq_len], - // &[batch_size, seq_len, 1], - // &self.get_device(), - // )? - // .to_dtype(DType::F32)?; - // features.push(nce_tensor); - // } - - // // Conditionally add instrument - // if let Some(instr) = instrument { - // let instrument_tensor = Tensor::from_slice( - // &vec![parse_instrument_index(instr) as u32; seq_len], - // &[batch_size, seq_len, 1], - // &self.get_device(), - // )? - // .to_dtype(DType::F32)?; - // features.push(instrument_tensor); - // } - - // // Concatenate features - // Ok(Tensor::cat(&features, 2)?) - // } /// Fine-tune the model on a batch of training data. /// diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 9f3a1d3..811a50c 100644 --- a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -1,32 +1,19 @@ -use anyhow::{anyhow, Result}; -use candle_core::{DType, Device, IndexOp, Tensor, Var, D}; -use candle_nn::{ - ops, Conv1d, Conv1dConfig, Dropout, Linear, Module, Optimizer, PReLU, VarBuilder, VarMap, -}; -use log::info; -use ndarray::Array2; -use serde::Deserialize; +use anyhow::Result; +use candle_core::{DType, Device, IndexOp, Tensor}; +use candle_nn::{Dropout, Module, VarBuilder, VarMap}; use std::collections::HashMap; use std::fmt; use std::path::Path; use crate::{ - building_blocks::{ - building_blocks::{ - DecoderLinear, HiddenHfaceTransformer, Input26aaModPositionalEncoding, MetaEmbedding, - ModLossNN, AA_EMBEDDING_SIZE, MOD_FEATURE_SIZE, - }, - featurize::{aa_one_hot, get_aa_indices, get_mod_features}, + building_blocks::building_blocks::{ + DecoderLinear, HiddenHfaceTransformer, Input26aaModPositionalEncoding, MetaEmbedding, + ModLossNN, MOD_FEATURE_SIZE, }, - models::model_interface::{load_tensors_from_model, create_var_map, ModelInterface, PropertyType}, - utils::{ - data_handling::PeptideData, - logging::Progress, - peptdeep_utils::{ - get_modification_indices, get_modification_string, load_mod_to_feature, - parse_model_constants, remove_mass_shift, ModelConstants, - }, + models::model_interface::{ + create_var_map, load_tensors_from_model, ModelInterface, PropertyType, }, + utils::peptdeep_utils::{load_mod_to_feature, parse_model_constants, ModelConstants}, }; // Constants @@ -183,7 +170,7 @@ impl ModelInterface for MS2BertModel { } fn forward(&self, xs: &Tensor) -> Result { - let 
(batch_size, seq_len, _) = xs.shape().dims3()?; + let (_batch_size, seq_len, _) = xs.shape().dims3()?; // Separate the input tensor into the different parts @@ -206,18 +193,42 @@ impl ModelInterface for MS2BertModel { let nce_out = nce_out.squeeze(2)?; // Squeeze to remove dimensions of size 1 if needed let instrument_out = instrument_out.squeeze(2)?.squeeze(1)?; // Squeeze to remove dimensions of size 1 if needed - log::trace!("[MS2BertModel::forward] aa_indices_out shape: {:?}, device: {:?}", aa_indices_out.shape(), aa_indices_out.device()); - log::trace!("[MS2BertModel::forward] mod_x_out shape: {:?}, device: {:?}", mod_x_out.shape(), mod_x_out.device()); - log::trace!("[MS2BertModel::forward] charge_out shape: {:?}, device: {:?}", charge_out.shape(), charge_out.device()); - log::trace!("[MS2BertModel::forward] nce_out shape: {:?}, device: {:?}", nce_out.shape(), nce_out.device()); - log::trace!("[MS2BertModel::forward] instrument_out shape: {:?}, device: {:?}", instrument_out.shape(), instrument_out.device()); + log::trace!( + "[MS2BertModel::forward] aa_indices_out shape: {:?}, device: {:?}", + aa_indices_out.shape(), + aa_indices_out.device() + ); + log::trace!( + "[MS2BertModel::forward] mod_x_out shape: {:?}, device: {:?}", + mod_x_out.shape(), + mod_x_out.device() + ); + log::trace!( + "[MS2BertModel::forward] charge_out shape: {:?}, device: {:?}", + charge_out.shape(), + charge_out.device() + ); + log::trace!( + "[MS2BertModel::forward] nce_out shape: {:?}, device: {:?}", + nce_out.shape(), + nce_out.device() + ); + log::trace!( + "[MS2BertModel::forward] instrument_out shape: {:?}, device: {:?}", + instrument_out.shape(), + instrument_out.device() + ); // Forward pass through input_nn with dropout let in_x = self .dropout .forward(&self.input_nn.forward(&aa_indices_out, &mod_x_out)?, true)?; - log::trace!("[MS2BertModel::forward] in_x shape (post dropout-input_nn): {:?}, device: {:?}", in_x.shape(), in_x.device()); + log::trace!( + "[MS2BertModel::forward] in_x shape (post dropout-input_nn): {:?}, device: {:?}", + in_x.shape(), + in_x.device() + ); // Prepare metadata for meta_nn let meta_x = self @@ -225,17 +236,27 @@ impl ModelInterface for MS2BertModel { .forward(&charge_out, &nce_out, &instrument_out)? .unsqueeze(1)? 
.repeat(vec![1, seq_len as usize, 1])?; - log::trace!("[MS2BertModel::forward] meta_x (post meta_nn) shape: {:?}, device: {:?}", meta_x.shape(), meta_x.device()); + log::trace!( + "[MS2BertModel::forward] meta_x (post meta_nn) shape: {:?}, device: {:?}", + meta_x.shape(), + meta_x.device() + ); // Concatenate in_x and meta_x along dimension 2 let combined_input = Tensor::cat(&[in_x.clone(), meta_x], 2)?; - log::trace!("[MS2BertModel::forward] combined_input shape: {:?}, device: {:?}", combined_input.shape(), combined_input.device()); + log::trace!( + "[MS2BertModel::forward] combined_input shape: {:?}, device: {:?}", + combined_input.shape(), + combined_input.device() + ); // Forward pass through hidden_nn - let hidden_x = self - .hidden_nn - .forward(&combined_input.clone(), None)?; - log::trace!("[MS2BertModel::forward] hidden_x shape: {:?}, device: {:?}", hidden_x.shape(), hidden_x.device()); + let hidden_x = self.hidden_nn.forward(&combined_input.clone(), None)?; + log::trace!( + "[MS2BertModel::forward] hidden_x shape: {:?}, device: {:?}", + hidden_x.shape(), + hidden_x.device() + ); // // Handle attentions if needed (similar to PyTorch) // if self.output_attentions { @@ -247,11 +268,19 @@ impl ModelInterface for MS2BertModel { // Apply dropout and combine with input let x_tmp = (hidden_x + combined_input * 0.2)?; let hidden_output = self.dropout.forward(&x_tmp, true)?; - log::trace!("[MS2BertModel::forward] hidden_output shape: {:?}, device: {:?}", hidden_output.shape(), hidden_output.device()); + log::trace!( + "[MS2BertModel::forward] hidden_output shape: {:?}, device: {:?}", + hidden_output.shape(), + hidden_output.device() + ); // Forward pass through output_nn let mut out_x = self.output_nn.forward(&hidden_output)?; - log::trace!("[MS2BertModel::forward] out_x shape: {:?}, device: {:?}", out_x.shape(), out_x.device()); + log::trace!( + "[MS2BertModel::forward] out_x shape: {:?}, device: {:?}", + out_x.shape(), + out_x.device() + ); // Handle modloss if applicable (similar logic as PyTorch) if self.num_modloss_types > 0 { @@ -338,7 +367,6 @@ impl ModelInterface for MS2BertModel { fn print_weights(&self) { todo!() } - } // // Module Trait Implementation @@ -403,12 +431,7 @@ mod tests { use super::*; use crate::models::model_interface::ModelInterface; use crate::models::ms2_bert_model::MS2BertModel; - use crate::utils::peptdeep_utils::load_modifications; use candle_core::Device; - use csv::Reader; - use rayon::vec; - use std::collections::HashMap; - use std::fs::File; use std::path::PathBuf; #[test] From e67695109723d8daa5bf01b729f2ad57fb34c5a7 Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 00:59:57 -0400 Subject: [PATCH 14/75] add: TransformerEncoder and SeqTransformer block --- .../src/building_blocks/building_blocks.rs | 315 ++++++++++++++++-- .../src/building_blocks/nn.rs | 196 ++++++++++- 2 files changed, 488 insertions(+), 23 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index 2ec0f85..d1afa48 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -4,12 +4,15 @@ use candle_nn as nn; use candle_transformers as transformers; use core::num; use std::fmt; +use std::time::Instant; use crate::building_blocks::bilstm::BidirectionalLSTM; use crate::building_blocks::featurize::aa_one_hot; use crate::building_blocks::nn::{BertEncoderModule, ModuleList}; use 
crate::building_blocks::sequential::{seq, Sequential}; +use super::nn::TransformerEncoder; + /// constants used by PeptDeep Models pub const MOD_FEATURE_SIZE: usize = 109; // TODO: derive from constants yaml pub const AA_EMBEDDING_SIZE: usize = 27; // TODO: derive from constants yaml @@ -302,7 +305,7 @@ pub struct MetaEmbedding { } impl MetaEmbedding { - fn new(out_features: usize, device: &Device) -> Result { + fn _new(out_features: usize, device: &Device) -> Result { let nn = nn::linear( MAX_INSTRUMENT_NUM + 1, out_features - 1, @@ -432,13 +435,13 @@ pub struct HiddenHfaceTransformer { } impl HiddenHfaceTransformer { - fn new( - hidden_dim: usize, - hidden_expand: usize, - nheads: usize, - nlayers: usize, - dropout: f64, - output_attentions: bool + fn _new( + _hidden_dim: usize, + _hidden_expand: usize, + _nheads: usize, + _nlayers: usize, + _dropout: f64, + _output_attentions: bool ) -> Result { unimplemented!() } @@ -450,7 +453,7 @@ impl HiddenHfaceTransformer { nheads: usize, nlayers: usize, dropout: f64, - output_attentions: bool + _output_attentions: bool ) -> Result { let config = transformers::models::bert::Config { hidden_size: hidden_dim, @@ -583,8 +586,48 @@ struct SeqCNN { } impl SeqCNN { - fn new() -> Self { - unimplemented!(); + pub fn new(embedding_hidden: usize, device: &Device) -> Result { + let varmap = nn::VarMap::new(); + let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); + + let cnn_short = nn::conv1d( + embedding_hidden, + embedding_hidden, + 3, + nn::Conv1dConfig { + padding: 1, + ..Default::default() + }, + varbuilder.pp("cnn_short"), + )?; + + let cnn_medium = nn::conv1d( + embedding_hidden, + embedding_hidden, + 5, + nn::Conv1dConfig { + padding: 2, + ..Default::default() + }, + varbuilder.pp("cnn_medium"), + )?; + + let cnn_long = nn::conv1d( + embedding_hidden, + embedding_hidden, + 7, + nn::Conv1dConfig { + padding: 3, + ..Default::default() + }, + varbuilder.pp("cnn_long"), + )?; + + Ok(Self { + cnn_short, + cnn_medium, + cnn_long, + }) } pub fn from_varstore( @@ -654,7 +697,7 @@ struct SeqLSTM { } impl SeqLSTM { - fn new() -> Self { + fn _new() -> Self { unimplemented!(); } @@ -675,6 +718,90 @@ impl Module for SeqLSTM { } } +/// Transformer block applied on sequence input using a custom Transformer encoder implementation. +/// This replaces the LSTM with a Transformer encoder for sequence modeling. +#[derive(Debug, Clone)] +pub struct SeqTransformer { + encoder: TransformerEncoder, + training: bool, +} + +impl SeqTransformer { + /// Construct a new transformer encoder block for sequence modeling. + /// + /// # Arguments + /// * `input_dim` - The input embedding dimension (e.g., CNN output). + /// * `model_dim` - The internal model dimension of the transformer. + /// * `ff_dim` - The feedforward hidden layer dimension. + /// * `num_heads` - Number of attention heads. + /// * `num_layers` - Number of transformer encoder layers. + /// * `max_len` - Maximum input sequence length. + /// * `dropout_prob` - Dropout probability. + /// * `device` - The device to place the tensors on. 
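+    ///
+    /// A hypothetical construction sketch (dimension values are illustrative, not crate
+    /// defaults):
+    ///
+    /// ```ignore
+    /// let block = SeqTransformer::new(140, 128, 512, 8, 2, 200, 0.1, &Device::Cpu)?;
+    /// let y = block.forward(&x)?; // [batch, seq_len, model_dim]
+    /// ```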
+ pub fn new( + input_dim: usize, + model_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + device: &Device, + ) -> Result { + let varmap = nn::VarMap::new(); + let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); + let encoder = TransformerEncoder::new( + &varbuilder, + input_dim, + model_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?; + Ok(Self { encoder, training: true }) + } + + /// Load a transformer encoder from a varstore (used when loading from pre-trained weights). + pub fn from_varstore( + varstore: nn::VarBuilder, + input_dim: usize, + model_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + device: &Device, + ) -> Result { + let encoder = TransformerEncoder::new( + &varstore, + input_dim, + model_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?; + Ok(Self { encoder, training: true }) + } + + pub fn set_training(&mut self, training: bool) { + self.training = training; + } +} + +impl Module for SeqTransformer { + fn forward(&self, x: &Tensor) -> Result { + self.encoder.forward_with_mask(x, None, self.training) + } +} + + /// apply linear transformation and tensor rescaling with softmax #[derive(Debug, Clone)] struct SeqAttentionSum { @@ -682,6 +809,16 @@ struct SeqAttentionSum { } impl SeqAttentionSum { + pub fn new(hidden_dim: usize, device: &Device) -> Result { + let varmap = nn::VarMap::new(); + let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); + let attention = nn::Linear::new( + varbuilder.get((1, hidden_dim), "attention.weight")?, + None, + ); + Ok(Self { attention }) + } + pub fn from_varstore(varstore: nn::VarBuilder, hidden_dim: usize, name: &str) -> Result { let attention = nn::Linear::new(varstore.get((1, hidden_dim), name).unwrap(), None); Ok(Self { attention }) @@ -719,7 +856,7 @@ pub struct Encoder26aaModCnnLstmAttnSum { } impl Encoder26aaModCnnLstmAttnSum { - fn new() -> Self { + fn _new() -> Self { unimplemented!(); } @@ -764,14 +901,25 @@ impl Encoder26aaModCnnLstmAttnSum { pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor) -> Result { + let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; + println!("Encoder26aaModCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let additional_tensors: Vec<&Tensor> = vec![&mod_x]; + println!("Encoder26aaModCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - + println!("Encoder26aaModCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; + println!("Encoder26aaModCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = self.input_lstm.forward(&x)?; + println!("Encoder26aaModCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; + println!("Encoder26aaModCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); Ok(x) } } @@ -786,7 +934,7 @@ pub struct Encoder26aaModChargeCnnLstmAttnSum { } impl Encoder26aaModChargeCnnLstmAttnSum { - fn 
new() -> Self { + fn _new() -> Self { unimplemented!(); } @@ -831,29 +979,156 @@ impl Encoder26aaModChargeCnnLstmAttnSum { pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor, charges: &Tensor) -> Result { + let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let charges_repeated = charges.unsqueeze(1)?.repeat(&[1, mod_x.dim(1)?, 1])?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - charges_repeated forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let additional_tensors: Vec<&Tensor> = vec![&mod_x, &charges_repeated]; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); let x = self.input_lstm.forward(&x)?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); + let start_time = Instant::now(); + let x = self.attn_sum.forward(&x)?; + println!("Encoder26aaModChargeCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + Ok(x) + } +} + + +/// Encode AAs (26 AA letters) and modifications using CNN + Transformer + AttentionSum. +#[derive(Debug, Clone)] +pub struct Encoder26aaModCnnTransformerAttnSum { + mod_nn: ModEmbeddingFixFirstK, + input_cnn: SeqCNN, + input_transformer: SeqTransformer, + attn_sum: SeqAttentionSum, +} + +impl Encoder26aaModCnnTransformerAttnSum { + pub fn from_varstore( + varstore: &nn::VarBuilder, + mod_hidden_dim: usize, + hidden_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + names_mod_nn: Vec<&str>, + names_input_cnn_weight: Vec<&str>, + names_input_cnn_bias: Vec<&str>, + transformer_pp: &str, + names_attn_sum: Vec<&str>, + device: &Device, + ) -> Result { + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + Ok(Self { + mod_nn: ModEmbeddingFixFirstK::from_varstore( + &varstore, + MOD_FEATURE_SIZE, + mod_hidden_dim, + names_mod_nn[0], + )?, + input_cnn: SeqCNN::from_varstore( + varstore.clone(), + input_dim, + names_input_cnn_weight, + names_input_cnn_bias, + )?, + input_transformer: SeqTransformer::from_varstore( + varstore.pp(transformer_pp).clone(), + input_dim * 4, + hidden_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?, + attn_sum: SeqAttentionSum::from_varstore( + varstore.clone(), + hidden_dim, + names_attn_sum[0], + )?, + }) + } + + /// Construct a CNN+Transformer+Attention encoder from scratch (no pretrained weights). 
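+    ///
+    /// The forward pass below runs mod-embedding -> one-hot concat -> CNN -> Transformer ->
+    /// attention-sum; a hypothetical call (values illustrative):
+    /// `Encoder26aaModCnnTransformerAttnSum::new(&device, 8, 128, 512, 8, 2, 200, 0.1)?`.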
+ pub fn new( + device: &Device, + mod_hidden_dim: usize, + hidden_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + ) -> Result { + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + Ok(Self { + mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, device)?, + input_cnn: SeqCNN::new(input_dim, device)?, + input_transformer: SeqTransformer::new( + input_dim * 4, + hidden_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?, + attn_sum: SeqAttentionSum::new(hidden_dim, device)?, + }) + } + + pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor) -> Result { + let start_time = Instant::now(); + let mod_x = self.mod_nn.forward(mod_x)?; + println!("Encoder26aaModCnnTransformerAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + + let additional_tensors: Vec<&Tensor> = vec![&mod_x]; + let start_time = Instant::now(); + let x = aa_one_hot(aa_indices, &additional_tensors) + .map_err(|e| candle_core::Error::Msg(e.to_string()))?; + println!("Encoder26aaModCnnTransformerAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + + let start_time = Instant::now(); + let x = self.input_cnn.forward(&x)?; + println!("Encoder26aaModCnnTransformerAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + + let start_time = Instant::now(); + let x = self.input_transformer.forward(&x)?; + println!("Encoder26aaModCnnTransformerAttnSum::forward - input_transformer forward time: {:.3?}", start_time.elapsed()); + + let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; + println!("Encoder26aaModCnnTransformerAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + Ok(x) } } + + #[cfg(test)] mod tests { - use crate::models::model_interface::ModelInterface; - use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; - use crate::utils::peptdeep_utils::load_modifications; use candle_core::Device; use candle_nn::VarBuilder; use std::path::PathBuf; - use std::time::Instant; - // use itertools::izip; use super::*; diff --git a/crates/redeem-properties/src/building_blocks/nn.rs b/crates/redeem-properties/src/building_blocks/nn.rs index 753bedc..1373032 100644 --- a/crates/redeem-properties/src/building_blocks/nn.rs +++ b/crates/redeem-properties/src/building_blocks/nn.rs @@ -1,7 +1,8 @@ -use candle_core::{Result, Tensor}; -use candle_nn::Module; -use std::ops::{Deref, DerefMut}; +use candle_core::{Device, IndexOp, Result, Tensor}; +use candle_nn::{Dropout, LayerNorm, Linear, Module, VarBuilder}; use candle_transformers::models::bert::{BertEncoder, Config}; +use candle_nn::ops::softmax; +use std::ops::{Deref, DerefMut}; use std::sync::Arc; #[derive(Clone)] @@ -68,3 +69,192 @@ impl Module for BertEncoderModule { self.encoder.forward(hidden_states, &attention_mask) } } + + +/// A minimal Transformer encoder layer with multi-head self-attention, feedforward block, +/// dropout, and optional sinusoidal positional encoding and padding mask support. 
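+///
+/// Each layer applies post-norm residual blocks, `norm(x + dropout(attn(x)))` followed by
+/// `norm(x + dropout(ff(x)))`, and sinusoidal positions are added to the input before the
+/// first layer. A hypothetical forward pass (shapes illustrative):
+///
+/// ```ignore
+/// let enc = TransformerEncoder::new(&vb, 140, 128, 512, 8, 2, 200, 0.1, &device)?;
+/// let y = enc.forward_with_mask(&x, None, false)?; // [batch, seq_len, model_dim]
+/// ```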
+#[derive(Debug, Clone)]
+pub struct TransformerEncoder {
+    layers: Vec<TransformerEncoderLayer>,
+    pos_encoding: Tensor,
+    dropout: Dropout,
+}
+
+impl TransformerEncoder {
+    pub fn new(
+        varbuilder: &VarBuilder,
+        input_dim: usize,
+        model_dim: usize,
+        ff_dim: usize,
+        num_heads: usize,
+        num_layers: usize,
+        max_len: usize,
+        dropout_prob: f32,
+        device: &Device,
+    ) -> Result<Self> {
+        let mut layers = Vec::new();
+        for i in 0..num_layers {
+            let layer = TransformerEncoderLayer::new(
+                &varbuilder.pp(&format!("layer_{}", i)),
+                input_dim,
+                model_dim,
+                ff_dim,
+                num_heads,
+                dropout_prob,
+            )?;
+            layers.push(layer);
+        }
+        let pos_encoding = create_sinusoidal_encoding(max_len, model_dim, device)?;
+        let dropout = Dropout::new(dropout_prob);
+        Ok(Self { layers, pos_encoding, dropout })
+    }
+
+    pub fn forward_with_mask(&self, x: &Tensor, padding_mask: Option<&Tensor>, training: bool) -> Result<Tensor> {
+        let (b, t, _) = x.dims3()?;
+        let pe = self.pos_encoding.i((..t, ..))?.unsqueeze(0)?.broadcast_as((b, t, self.pos_encoding.dim(1)?))?;
+        let mut out = x + pe;
+        out = self.dropout.forward(&out?, training);
+        for layer in &self.layers {
+            out = layer.forward(&out?, padding_mask, training);
+        }
+        Ok(out?)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct TransformerEncoderLayer {
+    self_attn: MultiHeadAttention,
+    ff: FeedForward,
+    norm1: LayerNorm,
+    norm2: LayerNorm,
+    dropout1: Dropout,
+    dropout2: Dropout,
+}
+
+impl TransformerEncoderLayer {
+    pub fn new(
+        varbuilder: &VarBuilder,
+        input_dim: usize,
+        model_dim: usize,
+        ff_dim: usize,
+        num_heads: usize,
+        dropout_prob: f32,
+    ) -> Result<Self> {
+        Ok(Self {
+            self_attn: MultiHeadAttention::new(varbuilder, input_dim, model_dim, num_heads)?,
+            ff: FeedForward::new(varbuilder, model_dim, ff_dim)?,
+            norm1: {
+                let weight = varbuilder.get((model_dim,), "norm1.weight")?;
+                let bias = varbuilder.get((model_dim,), "norm1.bias")?;
+                LayerNorm::new(weight, bias, 1e-5)
+            },
+            norm2: {
+                let weight = varbuilder.get((model_dim,), "norm2.weight")?;
+                let bias = varbuilder.get((model_dim,), "norm2.bias")?;
+                LayerNorm::new(weight, bias, 1e-5)
+            },
+            dropout1: Dropout::new(dropout_prob),
+            dropout2: Dropout::new(dropout_prob),
+        })
+    }
+
+    pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>, training: bool) -> Result<Tensor> {
+        let attn = self.self_attn.forward(x, mask)?;
+        let x = self.norm1.forward(&(x + self.dropout1.forward(&attn, training)?)?)?;
+        let ff = self.ff.forward(&x)?;
+        self.norm2.forward(&(x + self.dropout2.forward(&ff, training)?)?)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct MultiHeadAttention {
+    proj_q: Linear,
+    proj_k: Linear,
+    proj_v: Linear,
+    proj_out: Linear,
+    num_heads: usize,
+    head_dim: usize,
+}
+
+impl MultiHeadAttention {
+    pub fn new(
+        varbuilder: &VarBuilder,
+        input_dim: usize,
+        model_dim: usize,
+        num_heads: usize,
+    ) -> Result<Self> {
+        let head_dim = model_dim / num_heads;
+        Ok(Self {
+            proj_q: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_q")?,
+            proj_k: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_k")?,
+            proj_v: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_v")?,
+            proj_out: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_out")?,
+            num_heads,
+            head_dim,
+        })
+    }
+
+    pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>) -> Result<Tensor> {
+        let (b, t, _) = x.dims3()?;
+        let q = self.proj_q.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
+        let k = self.proj_k.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
+        let v = self.proj_v.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
+
+        let mut scores = q.matmul(&k.transpose(2, 3)?)? / (self.head_dim as f64).sqrt();
+        if let Some(mask) = mask {
+            let mask = mask.unsqueeze(1)?;
+            let scale = Tensor::new(1e9f32, x.device())?;
+            // broadcast_mul is needed here: the scale is a scalar tensor, and plain `mul`
+            // requires identical shapes.
+            scores = scores?.broadcast_add(&mask.neg()?.broadcast_mul(&scale)?);
+        }
+
+        let scores = scores?;
+        let attn = candle_nn::ops::softmax(&scores, scores.dims().len() - 1)?;
+        let context = attn.matmul(&v)?.transpose(1, 2)?.reshape((b, t, self.num_heads * self.head_dim))?;
+        self.proj_out.forward(&context)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct FeedForward {
+    lin1: Linear,
+    lin2: Linear,
+}
+
+impl FeedForward {
+    pub fn new(varbuilder: &VarBuilder, model_dim: usize, ff_dim: usize) -> Result<Self> {
+        Ok(Self {
+            lin1: linear_from_varbuilder(varbuilder, model_dim, ff_dim, "lin1")?,
+            lin2: linear_from_varbuilder(varbuilder, ff_dim, model_dim, "lin2")?,
+        })
+    }
+
+    pub fn forward(&self, x: &Tensor) -> Result<Tensor> {
+        let x = self.lin1.forward(x)?.relu()?;
+        self.lin2.forward(&x)
+    }
+}
+
+
+fn linear_from_varbuilder(
+    vb: &VarBuilder,
+    in_dim: usize,
+    out_dim: usize,
+    prefix: &str,
+) -> Result<Linear> {
+    let weight = vb.get((out_dim, in_dim), &format!("{}.weight", prefix))?;
+    let bias = vb.get((out_dim,), &format!("{}.bias", prefix)).ok();
+    Ok(Linear::new(weight, bias))
+}
+
+
+/// Generate sinusoidal positional encoding like in "Attention is All You Need".
+pub fn create_sinusoidal_encoding(seq_len: usize, model_dim: usize, device: &Device) -> Result<Tensor> {
+    let mut pe = vec![0f32; seq_len * model_dim];
+    for pos in 0..seq_len {
+        for i in 0..model_dim {
+            let angle = pos as f32 / (10000f32).powf(2.
* (i / 2) as f32 / model_dim as f32); + pe[pos * model_dim + i] = if i % 2 == 0 { angle.sin() } else { angle.cos() }; + } + } + Tensor::from_vec(pe, (seq_len, model_dim), device) +} From e44dddda448d72a4c7ff6bf8cbd300f7ef887f12 Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 01:01:32 -0400 Subject: [PATCH 15/75] refactor: Update RTCNNLSTMModel forward method to improve performance and readability --- crates/redeem-properties/src/models/rt_cnn_lstm_model.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs index 89a6cec..e7ae329 100644 --- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs @@ -271,7 +271,7 @@ impl ModelInterface for RTCNNLSTMModel { #[cfg(test)] mod tests { - use crate::models::model_interface::ModelInterface; + use crate::models::model_interface::{ModelInterface, PredictionResult}; use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; use candle_core::Device; use std::path::PathBuf; From 999ccf51ad6501550c1c44ae4a50d28e234050c7 Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 13:27:12 -0400 Subject: [PATCH 16/75] refactor: Add RT-CNN Transformer model and update redeem-properties crate models --- .../src/building_blocks/building_blocks.rs | 87 ++--- .../src/building_blocks/nn.rs | 110 ++++-- .../src/models/ccs_cnn_lstm_model.rs | 5 + crates/redeem-properties/src/models/mod.rs | 1 + .../src/models/model_interface.rs | 348 ++++++++++++++---- .../src/models/ms2_bert_model.rs | 5 + .../src/models/rt_cnn_lstm_model.rs | 5 + .../src/models/rt_cnn_transformer_model.rs | 333 +++++++++++++++++ .../redeem-properties/src/models/rt_model.rs | 4 +- .../src/utils/peptdeep_utils.rs | 30 ++ crates/redeem-properties/src/utils/utils.rs | 58 +++ 11 files changed, 846 insertions(+), 140 deletions(-) create mode 100644 crates/redeem-properties/src/models/rt_cnn_transformer_model.rs diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index d1afa48..2c3fc28 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -2,6 +2,7 @@ use anyhow::{Context, Result as AnyHowResult}; use candle_core::{DType, Device, Module, Result, Tensor, D}; use candle_nn as nn; use candle_transformers as transformers; +use serde::de; use core::num; use std::fmt; use std::time::Instant; @@ -26,16 +27,18 @@ pub struct DecoderLinear { impl DecoderLinear { pub fn new(in_features: usize, out_features: usize, vb: &nn::VarBuilder) -> Result { - let weight = Tensor::zeros((in_features, 64), DType::F32, vb.device())?; - let bias = Tensor::zeros(64, DType::F32, vb.device())?; + // First linear layer: in_features -> 64 + let weight1 = Tensor::zeros((64, in_features), DType::F32, vb.device())?; + let bias1 = Tensor::zeros(64, DType::F32, vb.device())?; + let linear1 = nn::Linear::new(weight1, Some(bias1)); - let linear1 = nn::Linear::new(weight, Some(bias)); + // Activation let prelu = nn::PReLU::new(Tensor::zeros(64, DType::F32, vb.device())?, false); - let weight = Tensor::zeros((64, out_features), DType::F32, vb.device())?; - let bias = Tensor::zeros(64, DType::F32, vb.device())?; - - let linear2 = nn::Linear::new(weight, Some(bias)); + // Second linear layer: 64 -> out_features + let weight2 = Tensor::zeros((out_features, 64), 
DType::F32, vb.device())?; + let bias2 = Tensor::zeros(out_features, DType::F32, vb.device())?; + let linear2 = nn::Linear::new(weight2, Some(bias2)); let mut nn = seq(); nn = nn.add(linear1); @@ -73,10 +76,21 @@ impl DecoderLinear { impl Module for DecoderLinear { fn forward(&self, x: &Tensor) -> Result { - self.nn.forward(x) + log::trace!("[DecoderLinear] input shape: {:?}", x.shape()); + match self.nn.forward(x) { + Ok(output) => { + log::trace!("[DecoderLinear] output shape: {:?}", output.shape()); + Ok(output) + } + Err(e) => { + log::error!("[DecoderLinear] forward pass failed: {:?}", e); + Err(e) + } + } } } + impl fmt::Debug for DecoderLinear { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DecoderLinear") @@ -198,10 +212,9 @@ struct ModEmbeddingFixFirstK { } impl ModEmbeddingFixFirstK { - fn new(mod_feature_size: usize, out_features: usize, device: &Device) -> Result { + fn new(mod_feature_size: usize, out_features: usize, varbuilder: &nn::VarBuilder) -> Result { let k = 6; - let vb = nn::VarBuilder::zeros(DType::F32, device); - let nn = nn::linear(mod_feature_size - k, out_features - k, vb.pp("linear"))?; + let nn = nn::linear(mod_feature_size - k, out_features - k, varbuilder.pp("linear"))?; Ok(Self { k, nn }) } @@ -243,17 +256,8 @@ pub struct Input26aaModPositionalEncoding { } impl Input26aaModPositionalEncoding { - fn new(out_features: usize, max_len: usize, device: &Device) -> Result { - let mod_hidden = 8; - let mod_nn = ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden, device)?; - let aa_emb = AAEmbedding::new(out_features - mod_hidden, device)?; - let pos_encoder = PositionalEncoding::new(out_features, max_len, device)?; - - Ok(Self { - mod_nn, - aa_emb, - pos_encoder, - }) + fn new(_out_features: usize, _max_len: usize, _device: &Device) -> Result { + todo!("new untrained instance of Input26aaModPositionalEncoding not implemented"); } pub fn from_varstore( @@ -586,10 +590,7 @@ struct SeqCNN { } impl SeqCNN { - pub fn new(embedding_hidden: usize, device: &Device) -> Result { - let varmap = nn::VarMap::new(); - let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); - + pub fn new(embedding_hidden: usize, varbuilder: &nn::VarBuilder) -> Result { let cnn_short = nn::conv1d( embedding_hidden, embedding_hidden, @@ -730,6 +731,7 @@ impl SeqTransformer { /// Construct a new transformer encoder block for sequence modeling. /// /// # Arguments + /// * `varbuilder` - The variable builder for creating the model parameters. /// * `input_dim` - The input embedding dimension (e.g., CNN output). /// * `model_dim` - The internal model dimension of the transformer. /// * `ff_dim` - The feedforward hidden layer dimension. @@ -739,26 +741,25 @@ impl SeqTransformer { /// * `dropout_prob` - Dropout probability. /// * `device` - The device to place the tensors on. 
pub fn new( + varbuilder: &nn::VarBuilder, input_dim: usize, model_dim: usize, ff_dim: usize, num_heads: usize, num_layers: usize, max_len: usize, - dropout_prob: f32, + dropout: f32, device: &Device, ) -> Result { - let varmap = nn::VarMap::new(); - let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); let encoder = TransformerEncoder::new( - &varbuilder, + varbuilder, input_dim, model_dim, ff_dim, num_heads, num_layers, max_len, - dropout_prob, + dropout, device, )?; Ok(Self { encoder, training: true }) @@ -809,9 +810,7 @@ struct SeqAttentionSum { } impl SeqAttentionSum { - pub fn new(hidden_dim: usize, device: &Device) -> Result { - let varmap = nn::VarMap::new(); - let varbuilder = nn::VarBuilder::from_varmap(&varmap, DType::F32, device); + pub fn new(hidden_dim: usize, varbuilder: &nn::VarBuilder) -> Result { let attention = nn::Linear::new( varbuilder.get((1, hidden_dim), "attention.weight")?, None, @@ -1068,7 +1067,7 @@ impl Encoder26aaModCnnTransformerAttnSum { /// Construct a CNN+Transformer+Attention encoder from scratch (no pretrained weights). pub fn new( - device: &Device, + varbuilder: &nn::VarBuilder, mod_hidden_dim: usize, hidden_dim: usize, ff_dim: usize, @@ -1076,12 +1075,14 @@ impl Encoder26aaModCnnTransformerAttnSum { num_layers: usize, max_len: usize, dropout_prob: f32, + device: &Device, ) -> Result { let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; Ok(Self { - mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, device)?, - input_cnn: SeqCNN::new(input_dim, device)?, + mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?, + input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?, input_transformer: SeqTransformer::new( + &varbuilder.pp("input_transformer"), input_dim * 4, hidden_dim, ff_dim, @@ -1091,32 +1092,32 @@ impl Encoder26aaModCnnTransformerAttnSum { dropout_prob, device, )?, - attn_sum: SeqAttentionSum::new(hidden_dim, device)?, + attn_sum: SeqAttentionSum::new(hidden_dim, &varbuilder.pp("attn_sum"))?, }) } pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor) -> Result { let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - println!("Encoder26aaModCnnTransformerAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - mod_x forward time: {:.3?}", start_time.elapsed()); let additional_tensors: Vec<&Tensor> = vec![&mod_x]; let start_time = Instant::now(); let x = aa_one_hot(aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - println!("Encoder26aaModCnnTransformerAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - aa_one_hot forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; - println!("Encoder26aaModCnnTransformerAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_cnn forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.input_transformer.forward(&x)?; - println!("Encoder26aaModCnnTransformerAttnSum::forward - input_transformer forward time: {:.3?}", start_time.elapsed()); + log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_transformer forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = 
self.attn_sum.forward(&x)?; - println!("Encoder26aaModCnnTransformerAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - attn_sum forward time: {:.3?}", start_time.elapsed()); Ok(x) } diff --git a/crates/redeem-properties/src/building_blocks/nn.rs b/crates/redeem-properties/src/building_blocks/nn.rs index 1373032..cbab40e 100644 --- a/crates/redeem-properties/src/building_blocks/nn.rs +++ b/crates/redeem-properties/src/building_blocks/nn.rs @@ -96,7 +96,6 @@ impl TransformerEncoder { for i in 0..num_layers { let layer = TransformerEncoderLayer::new( &varbuilder.pp(&format!("layer_{}", i)), - input_dim, model_dim, ff_dim, num_heads, @@ -110,14 +109,26 @@ impl TransformerEncoder { } pub fn forward_with_mask(&self, x: &Tensor, padding_mask: Option<&Tensor>, training: bool) -> Result { + log::trace!("[TransformerEncoder] input x shape: {:?}", x.shape()); + let (b, t, _) = x.dims3()?; - let pe = self.pos_encoding.i((..t, ..))?.unsqueeze(0)?.broadcast_as((b, t, self.pos_encoding.dim(1)?))?; - let mut out = x + pe; - out = self.dropout.forward(&out?, training); - for layer in &self.layers { - out = layer.forward(&out?, padding_mask, training); + let pe = self.pos_encoding.i((..t, ..))? + .unsqueeze(0)? + .broadcast_as((b, t, self.pos_encoding.dim(1)?))?; + + log::trace!("[TransformerEncoder] positional encoding shape: {:?}", pe.shape()); + + let mut out = x.broadcast_add(&pe)?; + out = self.dropout.forward(&out, training)?; + + log::trace!("[TransformerEncoder] after dropout shape: {:?}", out.shape()); + + for (i, layer) in self.layers.iter().enumerate() { + log::trace!("[TransformerEncoder] applying layer {}", i); + out = layer.forward(&out, padding_mask, training)?; + log::trace!("[TransformerEncoder] output shape after layer {}: {:?}", i, out.shape()); } - Ok(out?) + Ok(out) } } @@ -134,38 +145,41 @@ pub struct TransformerEncoderLayer { impl TransformerEncoderLayer { pub fn new( varbuilder: &VarBuilder, - input_dim: usize, model_dim: usize, ff_dim: usize, num_heads: usize, dropout_prob: f32, ) -> Result { Ok(Self { - self_attn: MultiHeadAttention::new(varbuilder, input_dim, model_dim, num_heads)?, + self_attn: MultiHeadAttention::new(varbuilder, model_dim, model_dim, num_heads)?, ff: FeedForward::new(varbuilder, model_dim, ff_dim)?, norm1: { let weight = varbuilder.get((model_dim,), "norm1.weight")?; let bias = varbuilder.get((model_dim,), "norm1.bias")?; LayerNorm::new(weight, bias, 1e-5) - }, + }, norm2: { let weight = varbuilder.get((model_dim,), "norm2.weight")?; let bias = varbuilder.get((model_dim,), "norm2.bias")?; LayerNorm::new(weight, bias, 1e-5) - }, + }, dropout1: Dropout::new(dropout_prob), dropout2: Dropout::new(dropout_prob), }) } pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>, training: bool) -> Result { + log::trace!("[TransformerEncoderLayer] input x shape: {:?}", x.shape()); let attn = self.self_attn.forward(x, mask)?; - let x = self.norm1.forward(&(x + self.dropout1.forward(&attn, training)?)?)?; + let x = self.norm1.forward(&x.broadcast_add(&self.dropout1.forward(&attn, training)?)?)?; let ff = self.ff.forward(&x)?; - self.norm2.forward(&(x + self.dropout2.forward(&ff, training)?)?) 
+        let result = self.norm2.forward(&x.broadcast_add(&self.dropout2.forward(&ff, training)?)?)?;
+        log::trace!("[TransformerEncoderLayer] output shape: {:?}", result.shape());
+        Ok(result)
     }
 }
 
+
 #[derive(Debug, Clone)]
 pub struct MultiHeadAttention {
     proj_q: Linear,
@@ -188,7 +202,7 @@ impl MultiHeadAttention {
             proj_q: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_q")?,
             proj_k: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_k")?,
             proj_v: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_v")?,
-            proj_out: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_out")?,
+            proj_out: linear_from_varbuilder(varbuilder, model_dim, model_dim, "proj_out")?,
             num_heads,
             head_dim,
         })
@@ -196,20 +210,69 @@ impl MultiHeadAttention {
 
     pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>) -> Result<Tensor> {
         let (b, t, _) = x.dims3()?;
-        let q = self.proj_q.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
-        let k = self.proj_k.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
-        let v = self.proj_v.forward(x)?.reshape((b, t, self.num_heads, self.head_dim))?.transpose(1, 2)?;
+        log::trace!("[MultiHeadAttention] Input shape: b={}, t={}, head_dim={} (num_heads={})", b, t, self.head_dim, self.num_heads);
+
+        let q = self.proj_q.forward(x)?
+            .reshape((b, t, self.num_heads, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+
+        let k = self.proj_k.forward(x)?
+            .reshape((b, t, self.num_heads, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+
+        let v = self.proj_v.forward(x)?
+            .reshape((b, t, self.num_heads, self.head_dim))?
+            .transpose(1, 2)?
+            .contiguous()?;
+
+        log::trace!("[MultiHeadAttention] Q/K/V shape after projection and transpose: {:?}", q.shape());
+
+        let k_t = k.transpose(2, 3)?.contiguous()?;
+        let mut scores = match q.matmul(&k_t) {
+            Ok(s) => (s / (self.head_dim as f64).sqrt())?,
+            Err(e) => {
+                log::error!("[MultiHeadAttention] Failed during matmul for scores: {}", e);
+                return Err(e.into());
+            }
+        };
+
+        log::trace!("[MultiHeadAttention] Attention score shape: {:?}", scores.shape());
 
-        let mut scores = q.matmul(&k.transpose(2, 3)?)? / (self.head_dim as f64).sqrt();
         if let Some(mask) = mask {
+            log::trace!("[MultiHeadAttention] Applying mask");
             let mask = mask.unsqueeze(1)?;
             let scale = Tensor::new(1e9f32, x.device())?;
-            scores = scores?.broadcast_add(&mask.neg()?.mul(&scale)?);
+            scores = match scores.broadcast_add(&mask.neg()?.mul(&scale)?)
{ + Ok(s) => s, + Err(e) => { + log::error!("[MultiHeadAttention] Failed during masking: {}", e); + return Err(e.into()); + } + }; } - let scores = scores?; - let attn = candle_nn::ops::softmax(&scores, scores.dims().len() - 1)?; - let context = attn.matmul(&v)?.transpose(1, 2)?.reshape((b, t, self.num_heads * self.head_dim))?; + let attn = match candle_nn::ops::softmax(&scores, scores.dims().len() - 1) { + Ok(a) => a, + Err(e) => { + log::error!("[MultiHeadAttention] Failed during softmax: {}", e); + return Err(e.into()); + } + }; + + let context = match attn.matmul(&v) { + Ok(ctx) => ctx.transpose(1, 2)?.reshape((b, t, self.num_heads * self.head_dim))?, + Err(e) => { + log::error!("[MultiHeadAttention] Failed during attention context computation: {}", e); + return Err(e.into()); + } + }; + + log::trace!("[MultiHeadAttention] Final context shape: {:?}", context.shape()); self.proj_out.forward(&context) } } @@ -234,7 +297,6 @@ impl FeedForward { } } - fn linear_from_varbuilder( vb: &VarBuilder, in_dim: usize, @@ -246,7 +308,6 @@ fn linear_from_varbuilder( Ok(Linear::new(weight, bias)) } - /// Generate sinusoidal positional encoding like in "Attention is All You Need". pub fn create_sinusoidal_encoding(seq_len: usize, model_dim: usize, device: &Device) -> Result { let mut pe = vec![0f32; seq_len * model_dim]; @@ -258,3 +319,4 @@ pub fn create_sinusoidal_encoding(seq_len: usize, model_dim: usize, device: &Dev } Tensor::from_vec(pe, (seq_len, model_dim), device) } + diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs index 4a06304..463c126 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs @@ -58,6 +58,11 @@ impl ModelInterface for CCSCNNLSTMModel { "ccs_cnn_lstm" } + fn new_untrained(_device: Device) -> Result + { + unimplemented!("Untrained model creation is not implemented for this architecture."); + } + /// Create a new CCSCNNLSTMModel instance model from the given model and constants files. fn new>( model_path: P, diff --git a/crates/redeem-properties/src/models/mod.rs b/crates/redeem-properties/src/models/mod.rs index 9315e42..502cb15 100644 --- a/crates/redeem-properties/src/models/mod.rs +++ b/crates/redeem-properties/src/models/mod.rs @@ -5,3 +5,4 @@ pub mod ccs_cnn_lstm_model; pub mod ccs_model; pub mod ms2_model; pub mod model_interface; +pub mod rt_cnn_transformer_model; diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index b0e7655..8e614ef 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -1,5 +1,5 @@ use crate::{ - building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed}, + building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed}, models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper}, utils::{ data_handling::PeptideData, @@ -7,7 +7,7 @@ use crate::{ peptdeep_utils::{ get_modification_indices, get_modification_string, parse_instrument_index, remove_mass_shift, - } + }, }, }; use anyhow::{Context, Result}; @@ -19,31 +19,33 @@ use std::ops::Index; use std::path::Path; use std::{collections::HashMap, path::PathBuf}; - // Constants const CHARGE_FACTOR: f64 = 0.1; const NCE_FACTOR: f64 = 0.01; - /// Load tensors from a model file. 
-/// 
+///
 /// Supported model formats include:
 /// - PyTorch (.pt, .pth, .pkl)
 /// - SafeTensors (.safetensors)
-/// 
+///
 /// # Arguments
 /// * `model_path` - Path to the model file.
 /// * `device` - Device to load the tensors on.
-/// 
+///
 /// # Returns
 /// A vector of tuples containing the tensor names and their corresponding tensors.
-pub fn load_tensors_from_model<P: AsRef<Path>>(model_path: P, device: &Device) -> Result<Vec<(String, Tensor)>> {
+pub fn load_tensors_from_model<P: AsRef<Path>>(
+    model_path: P,
+    device: &Device,
+) -> Result<Vec<(String, Tensor)>> {
     let path: &Path = model_path.as_ref();
-    let extension = path.extension()
+    let extension = path
+        .extension()
         .and_then(|ext| ext.to_str())
         .unwrap_or("")
         .to_lowercase();
-    
+
     match extension.as_str() {
         "pt" | "pth" | "pkl" => {
             log::trace!("Loading tensors from PyTorch model: {:?}", path);
@@ -65,7 +67,6 @@ pub fn load_tensors_from_model<P: AsRef<Path>>(model_path: P, device: &Device) -
     }
 }
 
-
 /// Represents the type of property to predict.
 #[derive(Clone)]
 pub enum PropertyType {
@@ -85,7 +86,7 @@ impl PropertyType {
 }
 
 /// Represents a single prediction value or a matrix of prediction values.
-/// 
+///
 /// This enum is used to store the output of a model prediction, which can be a single value or a matrix of values. For example, retention time (RT) and collision cross-section (CCS) predictions are single values, while MS2 intensity predictions are matrices.
 #[derive(Clone)]
 pub enum PredictionValue {
@@ -97,11 +98,10 @@ impl PredictionValue {
     // Returns a reference to the element at position (i, j) if it exists
     pub fn get(&self, i: usize, j: usize) -> Option<&f32> {
         match self {
-            PredictionValue::Single(_) => None, 
+            PredictionValue::Single(_) => None,
             PredictionValue::Matrix(vec) => vec.get(i).and_then(|row| row.get(j)),
         }
     }
-
 }
 
 impl Index<usize> for PredictionValue {
@@ -126,9 +126,8 @@ impl Index<(usize, usize)> for PredictionValue {
     }
 }
 
-
 /// Represents the output of a model prediction.
-/// 
+///
 /// This enum is used to store the output of a model prediction, which can be a vector of retention times (RT), collision cross-sections (CCS), or a vector of matrices of MS2 intensities.
 #[derive(Debug, Clone)]
 pub enum PredictionResult {
@@ -150,13 +149,13 @@ impl PredictionResult {
         match self {
             PredictionResult::RTResult(vec) => PredictionValue::Single(vec[index].clone()),
             PredictionResult::IMResult(vec) => PredictionValue::Single(vec[index].clone()),
-            PredictionResult::MS2Result(vec) => PredictionValue::Matrix(vec[index].clone()), 
+            PredictionResult::MS2Result(vec) => PredictionValue::Matrix(vec[index].clone()),
         }
     }
 }
 
 /// Populates a mutable `VarMap` instance with tensors.
-/// 
+///
 /// # Arguments
 /// * `var_map` - A mutable reference to a `VarMap` instance.
 /// * `tensor_data` - A vector of tuples containing the tensor names and their corresponding tensors.
@@ -175,12 +174,10 @@ pub fn create_var_map(
     Ok(())
 }
 
-
 pub trait ModelClone {
     fn clone_box(&self) -> Box<dyn ModelInterface>;
 }
 
-
 impl<T> ModelClone for T
 where
     T: 'static + ModelInterface + Clone + Send + Sync,
@@ -190,7 +187,6 @@ where
     }
 }
 
-
 impl Clone for Box<dyn ModelInterface> {
     fn clone(&self) -> Self {
         self.clone_box()
     }
 }
 
 /// Represents an abstract deep learning model interface.
-/// 
+///
 /// This trait defines the methods and properties that a deep learning model must implement to be used for property prediction tasks.
 pub trait ModelInterface: Send + Sync + ModelClone {
-
     /// Get the property type of the model.
     fn property_type(&self) -> PropertyType;
 
     /// Get the model architecture name.
 fn model_arch(&self) -> &'static str;
 
-    /// Create a new instance of the model.
+    /// Create a new model instance from scratch (no pretrained weights).
+    /// This is typically used when training a new model from scratch.
+    fn new_untrained(device: Device) -> Result<Self>
+    where
+        Self: Sized;
+
+    /// Create a new instance of the model from a pretrained model (.pth or .safetensors) and a constants file.
     fn new<P: AsRef<Path>>(
         model_path: P,
         constants_path: P,
@@ -314,7 +315,8 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                 &vec![c as f64 * CHARGE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
                 device,
-            )?.to_dtype(DType::F32)?;
+            )?
+            .to_dtype(DType::F32)?;
             features.push(charge_tensor);
         }
 
@@ -323,17 +325,16 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                 &vec![n as f64 * NCE_FACTOR; seq_len],
                 &[batch_size, seq_len, 1],
                 device,
-            )?.to_dtype(DType::F32)?;
+            )?
+            .to_dtype(DType::F32)?;
             features.push(nce_tensor);
         }
 
         if let Some(instr) = instrument {
             let instr_idx = parse_instrument_index(instr) as u32;
-            let instr_tensor = Tensor::from_slice(
-                &vec![instr_idx; seq_len],
-                &[batch_size, seq_len, 1],
-                device,
-            )?.to_dtype(DType::F32)?;
+            let instr_tensor =
+                Tensor::from_slice(&vec![instr_idx; seq_len], &[batch_size, seq_len, 1], device)?
+                    .to_dtype(DType::F32)?;
             features.push(instr_tensor);
         }
 
@@ -371,7 +372,9 @@ pub trait ModelInterface: Send + Sync + ModelClone {
             .collect::<Result<Vec<_>>>()?;
 
         if tensors.is_empty() {
-            return Err(anyhow::anyhow!("Encoding batch of peptides failed, the resulting tesnor batch is empty."));
+            return Err(anyhow::anyhow!(
+                "Encoding batch of peptides failed, the resulting tensor batch is empty."
+            ));
         }
 
         let max_len = tensors
@@ -380,25 +383,168 @@ pub trait ModelInterface: Send + Sync + ModelClone {
             .max()
             .unwrap_or(0);
 
-        let padded = tensors
+        let padded = tensors
             .into_par_iter()
             .map(|t| {
                 let (_, seq_len, feat_dim) = t.shape().dims3()?;
                 if seq_len < max_len {
-                    let pad = Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?;
+                    let pad =
+                        Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?;
                     Tensor::cat(&[&t, &pad], 1)
                 } else {
                     Ok(t)
                 }
             })
-            .map(|res| res.map_err(anyhow::Error::from))
+            .map(|res| res.map_err(anyhow::Error::from))
            .collect::<Result<Vec<_>, _>>()?;
 
         Ok(Tensor::cat(&padded, 0)?)
     }
 
+    /// Train the model from scratch using a batch of training data.
+    ///
+    /// This method is similar to `fine_tune`, but assumes that the model was created from `new_untrained`
+    /// and has no pre-existing learned weights.
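    // A minimal usage sketch of this from-scratch path (hypothetical data file and
    // hyperparameters; `RTCNNTFModel` is added later in this patch and the CLI
    // loader `load_peptide_data` in a later patch of this series):
    //
    //     let mut model = RTCNNTFModel::new_untrained(Device::Cpu)?;
    //     let train_set: Vec<PeptideData> = load_peptide_data("train.csv")?;
    //     let mods = load_modifications()?;
    //     model.train(&train_set, None, mods, 64, 1e-3, 10)?;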
+ fn train( + &mut self, + training_data: &Vec, + validation_data: Option<&Vec>, + modifications: HashMap< + (String, Option), + crate::utils::peptdeep_utils::ModificationMap, + >, + batch_size: usize, + learning_rate: f64, + epochs: usize, + ) -> Result<()> { + let num_batches = (training_data.len() + batch_size - 1) / batch_size; + + info!( + "Training {} model from scratch on {} peptide features ({} batches) for {} epochs", + self.get_model_arch(), + training_data.len(), + num_batches, + epochs + ); + + let params = candle_nn::ParamsAdamW { + lr: learning_rate, + ..Default::default() + }; + let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; + + for epoch in 0..epochs { + let progress = Progress::new(num_batches, &format!("[training] Epoch {}: ", epoch)); + let mut total_loss = 0.0; + + training_data + .chunks(batch_size) + .enumerate() + .try_for_each(|(batch_idx, batch_data)| { + let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); + let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); + + let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); + let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; + + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); + let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; + + let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); + let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; + + let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; + + let batch_targets = match self.property_type() { + PropertyType::RT => PredictionResult::RTResult( + batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), + ), + PropertyType::CCS => PredictionResult::IMResult( + batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), + ), + PropertyType::MS2 => { + return Err(anyhow::anyhow!("Training from scratch is not yet implemented for MS2")); + } + }; + + let target_batch = match batch_targets { + PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { + Tensor::new(values.clone(), &self.get_device())? 
+ } + PredictionResult::MS2Result(_) => unreachable!(), + }; + + let predicted = self.forward(&input_batch)?; + let loss = candle_nn::loss::mse(&predicted, &target_batch)?; + opt.backward_step(&loss)?; + + total_loss += loss.to_vec0::().unwrap_or(999.0); + progress.update_description(&format!("[training] Epoch {}: Loss: {:.4}", epoch, loss.to_vec0::()?)); + progress.inc(); + + Ok(()) + })?; + + // Optional validation evaluation + if let Some(val_data) = validation_data { + let peptides: Vec = val_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = val_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); + let mod_sites: Vec = val_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); + + let charges = val_data.iter().filter_map(|p| p.charge).collect::>(); + let charges = if charges.len() == val_data.len() { Some(charges) } else { None }; + + let nces = val_data.iter().filter_map(|p| p.nce).collect::>(); + let nces = if nces.len() == val_data.len() { Some(nces) } else { None }; + + let instruments = val_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); + let instruments = if instruments.len() == val_data.len() { Some(instruments) } else { None }; + + let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; + + let val_targets = match self.property_type() { + PropertyType::RT => PredictionResult::RTResult( + val_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), + ), + PropertyType::CCS => PredictionResult::IMResult( + val_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), + ), + PropertyType::MS2 => { + return Err(anyhow::anyhow!("Validation not supported for MS2 yet")); + } + }; + + let target_val = match val_targets { + PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { + Tensor::new(values.clone(), &self.get_device())? + } + PredictionResult::MS2Result(_) => unreachable!(), + }; + + let predicted = self.forward(&input_val)?; + let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; + let val_loss_val = val_loss.to_vec0::()?; + + info!("[validation] Epoch {}: Validation Loss: {:.4}", epoch, val_loss_val); + + let avg_loss = total_loss / num_batches as f32; + progress.update_description(&format!("[training] Epoch {}: Avg. Loss: {:.4} | Val. Loss: {:.4}", epoch, avg_loss, val_loss_val)); + progress.finish(); + } + else + { + let avg_loss = total_loss / num_batches as f32; + progress.update_description(&format!("[training] Epoch {}: Avg. Loss: {:.4}", epoch, avg_loss)); + progress.finish(); + } + } + + Ok(()) + } + /// Fine-tune the model on a batch of training data. - /// + /// /// # Arguments /// * `training_data` - A vector of `PeptideData` instances representing the training data. /// * `modifications` - A map of modifications and their corresponding feature vectors. 
@@ -426,86 +572,146 @@ pub trait ModelInterface: Send + Sync + ModelClone { full_batches } }; - + info!( "Fine-tuning {} model on {} peptide features ({} batches) for {} epochs", - self.get_model_arch(), training_data.len(), num_batches, epochs + self.get_model_arch(), + training_data.len(), + num_batches, + epochs ); - + let params = candle_nn::ParamsAdamW { lr: learning_rate, ..Default::default() }; let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; - + for epoch in 0..epochs { let progress = Progress::new(num_batches, &format!("[fine-tuning] Epoch {}: ", epoch)); let mut total_loss = 0.0; - + for batch_idx in 0..num_batches { let start = batch_idx * batch_size; let end = (start + batch_size).min(training_data.len()); let batch_data = &training_data[start..end]; - - let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); - let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - + + let peptides: Vec = batch_data + .iter() + .map(|p| remove_mass_shift(&p.sequence)) + .collect(); + let mods: Vec = batch_data + .iter() + .map(|p| get_modification_string(&p.sequence, &modifications)) + .collect(); + let mod_sites: Vec = batch_data + .iter() + .map(|p| get_modification_indices(&p.sequence)) + .collect(); + + let charges = batch_data + .iter() + .filter_map(|p| p.charge) + .collect::>(); + let charges = if charges.len() == batch_data.len() { + Some(charges) + } else { + None + }; + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - - let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; - - log::trace!("[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", input_batch.shape(), input_batch.device()); - + let nces = if nces.len() == batch_data.len() { + Some(nces) + } else { + None + }; + + let instruments = batch_data + .iter() + .filter_map(|p| p.instrument.clone()) + .collect::>(); + let instruments = if instruments.len() == batch_data.len() { + Some(instruments) + } else { + None + }; + + let input_batch = + self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; + + log::trace!( + "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", + input_batch.shape(), + input_batch.device() + ); + let batch_targets = match self.property_type() { PropertyType::RT => PredictionResult::RTResult( - batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), + batch_data + .iter() + .map(|p| p.retention_time.unwrap_or_default()) + .collect(), ), PropertyType::CCS => PredictionResult::IMResult( - batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), + batch_data + .iter() + .map(|p| p.ion_mobility.unwrap_or_default()) + .collect(), ), PropertyType::MS2 => PredictionResult::MS2Result( - batch_data.iter().map(|p| p.ms2_intensities.clone().unwrap_or_default()).collect(), + batch_data + .iter() + .map(|p| 
p.ms2_intensities.clone().unwrap_or_default()) + .collect(), ), }; - + let target_batch = match batch_targets { - PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { + PredictionResult::RTResult(ref values) + | PredictionResult::IMResult(ref values) => { Tensor::new(values.clone(), &self.get_device())? } PredictionResult::MS2Result(ref spectra) => { let max_len = spectra.iter().map(|s| s.len()).max().unwrap_or(1); - let feature_dim = spectra.get(0).and_then(|s| s.get(0)).map(|v| v.len()).unwrap_or(1); + let feature_dim = spectra + .get(0) + .and_then(|s| s.get(0)) + .map(|v| v.len()) + .unwrap_or(1); let mut padded_spectra = spectra.clone(); for s in &mut padded_spectra { s.resize(max_len, vec![0.0; feature_dim]); } - Tensor::new(padded_spectra.concat(), &self.get_device())?.reshape((batch_data.len(), max_len, feature_dim))? + Tensor::new(padded_spectra.concat(), &self.get_device())?.reshape(( + batch_data.len(), + max_len, + feature_dim, + ))? } }; - + let predicted = self.forward(&input_batch)?; let loss = candle_nn::loss::mse(&predicted, &target_batch)?; opt.backward_step(&loss)?; - + total_loss += loss.to_vec0::().unwrap_or(990.0); - progress.update_description(&format!("[fine-tuning] Epoch {}: Loss: {}", epoch, loss.to_vec0::()?)); + progress.update_description(&format!( + "[fine-tuning] Epoch {}: Loss: {}", + epoch, + loss.to_vec0::()? + )); progress.inc(); } - + let avg_loss = total_loss / num_batches as f32; - progress.update_description(&format!("[fine-tuning] Epoch {}: Avg. Batch Loss: {}", epoch, avg_loss)); + progress.update_description(&format!( + "[fine-tuning] Epoch {}: Avg. Batch Loss: {}", + epoch, avg_loss + )); progress.finish(); } - + Ok(()) } diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 811a50c..42b4d57 100644 --- a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -60,6 +60,11 @@ impl ModelInterface for MS2BertModel { "ms2_bert" } + fn new_untrained(_device: Device) -> Result + { + unimplemented!("Untrained model creation is not implemented for this architecture."); + } + /// Create a new MS2BERT model from the given model and constants files. fn new>( model_path: P, diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs index e7ae329..e73f6f4 100644 --- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs @@ -47,6 +47,11 @@ impl ModelInterface for RTCNNLSTMModel { "rt_cnn_lstm" } + fn new_untrained(_device: Device) -> Result + { + unimplemented!("Untrained model creation is not implemented for this architecture."); + } + /// Create a new RTCNNLSTMModel from the given model and constants files. 
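    // For reference, the tests added later in this patch construct the LSTM variant
    // with essentially this call (AlphaPeptDeep rt.pth checkpoint and its
    // model_const.yaml as the two paths):
    //
    //     let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device)?;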
     fn new<P: AsRef<Path>>(
         model_path: P,
diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
new file mode 100644
index 0000000..789c127
--- /dev/null
+++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
@@ -0,0 +1,333 @@
+use anyhow::Result;
+use candle_core::{DType, Device, IndexOp, Tensor};
+use candle_nn::{Dropout, Module, VarBuilder, VarMap};
+use std::collections::HashMap;
+use std::path::Path;
+
+use crate::building_blocks::building_blocks::{
+    DecoderLinear, Encoder26aaModCnnTransformerAttnSum, MOD_FEATURE_SIZE,
+};
+use crate::building_blocks::nn;
+use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map};
+use crate::utils::peptdeep_utils::{
+    load_mod_to_feature,
+    parse_model_constants, ModelConstants,
+};
+
+// Main Model Struct
+
+#[derive(Clone)]
+/// Represents a CNN-TF Retention Time model.
+pub struct RTCNNTFModel {
+    var_store: VarBuilder<'static>,
+    varmap: VarMap,
+    constants: ModelConstants,
+    device: Device,
+    mod_to_feature: HashMap<String, Vec<f32>>,
+    dropout: Dropout,
+    rt_encoder: Encoder26aaModCnnTransformerAttnSum,
+    rt_decoder: DecoderLinear,
+    is_training: bool,
+}
+
+// Automatically implement Send and Sync if all fields are Send and Sync
+unsafe impl Send for RTCNNTFModel {}
+unsafe impl Sync for RTCNNTFModel {}
+
+// Core Model Implementation
+
+impl ModelInterface for RTCNNTFModel {
+    fn property_type(&self) -> PropertyType {
+        PropertyType::RT
+    }
+
+    fn model_arch(&self) -> &'static str {
+        "rt_cnn_tf"
+    }
+
+    fn new_untrained(device: Device) -> Result<Self> {
+        let mut varmap = VarMap::new();
+        let varbuilder = VarBuilder::from_varmap(&varmap, DType::F32, &device);
+
+        let rt_encoder = Encoder26aaModCnnTransformerAttnSum::new(
+            &varbuilder,
+            8,    // mod_hidden_dim
+            140,  // hidden_dim
+            256,  // ff_dim
+            4,    // num_heads
+            2,    // num_layers
+            100,  // max_len
+            0.1,  // dropout_prob
+            &device,
+        )?;
+
+        let rt_decoder = DecoderLinear::new(140, 1, &varbuilder)?;
+        let constants = ModelConstants::default();
+        let mod_to_feature = load_mod_to_feature(&constants)?;
+
+        Ok(Self {
+            var_store: VarBuilder::from_varmap(&varmap, DType::F32, &device),
+            varmap,
+            constants,
+            device,
+            mod_to_feature,
+            dropout: Dropout::new(0.1),
+            rt_encoder,
+            rt_decoder,
+            is_training: true,
+        })
+    }
+
+    /// Create a new RTCNNTFModel from the given model and constants files.
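    // A usage sketch for the pretrained path below, assuming a checkpoint whose
    // tensor names match the `rt_encoder.*` / `rt_decoder.*` keys (the paths shown
    // are the AlphaPeptDeep test fixtures used elsewhere in this crate):
    //
    //     let model = RTCNNTFModel::new(
    //         "data/models/alphapeptdeep/generic/rt.pth",
    //         "data/models/alphapeptdeep/generic/rt.pth.model_const.yaml",
    //         0, 8, 4, true, Device::Cpu,
    //     )?;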
+ fn new>( + model_path: P, + constants_path: P, + _fixed_sequence_len: usize, + _num_frag_types: usize, + _num_modloss_types: usize, + _mask_modloss: bool, + device: Device, + ) -> Result { + let tensor_data = load_tensors_from_model(model_path.as_ref(), &device)?; + let mut varmap = candle_nn::VarMap::new(); + create_var_map(&mut varmap, tensor_data, &device)?; + let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device); + + let constants: ModelConstants = + parse_model_constants(constants_path.as_ref().to_str().unwrap())?; + + let mod_to_feature = load_mod_to_feature(&constants)?; + let dropout = Dropout::new(0.1); + + let rt_encoder = Encoder26aaModCnnTransformerAttnSum::from_varstore( + &var_store, + 8, // mod_hidden_dim + 140, // hidden_dim + 256, // ff_dim + 4, // num_heads + 2, // num_layers + 100, // max_len (set appropriately for your sequence length) + 0.1, // dropout_prob + vec!["rt_encoder.mod_nn.nn.weight"], + vec![ + "rt_encoder.input_cnn.cnn_short.weight", + "rt_encoder.input_cnn.cnn_medium.weight", + "rt_encoder.input_cnn.cnn_long.weight", + ], + vec![ + "rt_encoder.input_cnn.cnn_short.bias", + "rt_encoder.input_cnn.cnn_medium.bias", + "rt_encoder.input_cnn.cnn_long.bias", + ], + "rt_encoder.input_transformer", + vec!["rt_encoder.attn_sum.attn.0.weight"], + &device, + )?; + + + let rt_decoder = DecoderLinear::from_varstore( + &var_store, + 140, + 1, + vec!["rt_decoder.nn.0.weight", "rt_decoder.nn.1.weight", "rt_decoder.nn.2.weight"], + vec!["rt_decoder.nn.0.bias", "rt_decoder.nn.2.bias"] + )?; + + Ok(Self { + var_store, + varmap, + constants, + device, + mod_to_feature, + dropout, + rt_encoder, + rt_decoder, + is_training: true, + }) + } + + fn forward(&self, xs: &Tensor) -> Result { + let aa_indices_out = xs.i((.., .., 0))?; + let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; + log::trace!("[RTCNNTFModel] aa_indices_out: {:?}, mod_x_out: {:?}", aa_indices_out, mod_x_out); + let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; + log::trace!("[RTCNNTFModel] x.shape after rt_encoder: {:?}", x.shape()); + let x = self.dropout.forward(&x, self.is_training)?; + log::trace!("[RTCNNTFModel] x.shape after dropout: {:?}", x.shape()); + let x = self.rt_decoder.forward(&x)?; + log::trace!("[RTCNNTFModel] x.shape after rt_decoder: {:?}", x.shape()); + Ok(x.squeeze(1)?) + } + + /// Set model to evaluation mode for inference + /// This disables dropout and other training-specific layers. + fn set_evaluation_mode(&mut self) { + // println!("Setting evaluation mode"); + self.is_training = false; + } + + /// Set model to training mode for training + /// This enables dropout and other training-specific layers. + fn set_training_mode(&mut self) { + self.is_training = true; + } + + fn get_property_type(&self) -> String { + self.property_type().clone().as_str().to_string() + } + + fn get_model_arch(&self) -> String { + self.model_arch().to_string() + } + + fn get_device(&self) -> &Device { + &self.device + } + + fn get_mod_element_count(&self) -> usize { + self.constants.mod_elements.len() + } + + fn get_mod_to_feature(&self) -> &HashMap> { + &self.mod_to_feature + } + + fn get_min_pred_intensity(&self) -> f32 { + unimplemented!("Method not implemented for architecture: {}", self.model_arch()) + } + + fn get_mut_varmap(&mut self) -> &mut VarMap { + &mut self.varmap + } + + /// Print a summary of the model's constants. 
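    // Sketch of a full prediction pass through this model (peptide, modification,
    // and site strings follow the format used by the tests at the bottom of this
    // file; the output is assumed to be one RT value per peptide):
    //
    //     let xs = model.encode_peptides(&peptides, &mods, &mod_sites, None, None, None)?;
    //     let rt = model.forward(&xs)?;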
+ fn print_summary(&self) { + println!("RTModel Summary:"); + println!("AA Embedding Size: {}", self.constants.aa_embedding_size.unwrap()); + println!("Charge Factor: {:?}", self.constants.charge_factor); + println!("Instruments: {:?}", self.constants.instruments); + println!("Max Instrument Num: {}", self.constants.max_instrument_num); + println!("Mod Elements: {:?}", self.constants.mod_elements); + println!("NCE Factor: {:?}", self.constants.nce_factor); + } + + /// Print the model's weights. + fn print_weights(&self) { + todo!("Implement print_weights for RTCNNTFModel"); + } + + +} + +// Module Trait Implementation + +// impl Module for RTCNNLSTMModel { +// fn forward(&self, input: &Tensor) -> Result { +// ModelInterface::forward(self, input) +// } +// } + + +#[cfg(test)] +mod tests { + use crate::models::model_interface::ModelInterface; + use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; + use candle_core::Device; + use std::path::PathBuf; + + use super::*; + + #[test] + fn test_parse_model_constants() { + let path = "data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"; + let result = parse_model_constants(path); + assert!(result.is_ok()); + let constants = result.unwrap(); + assert_eq!(constants.aa_embedding_size.unwrap(), 27); + assert_eq!(constants.charge_factor, Some(0.1)); + assert_eq!(constants.instruments.len(), 4); + assert_eq!(constants.max_instrument_num, 8); + assert_eq!(constants.mod_elements.len(), 109); + assert_eq!(constants.nce_factor, Some(0.01)); + } + + #[test] + fn test_encode_peptides() { + let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); + let constants_path = + PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); + let device = Device::Cpu; + let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device).unwrap(); + + let peptide_sequences = "AGHCEWQMKYR"; + let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; + let mod_sites = "0;4;8"; + // let charge = Some(2); + // let nce = Some(20); + // let instrument = Some("QE"); + + let result = + model.encode_peptide(&peptide_sequences, mods, mod_sites, None, None, None); + + println!("{:?}", result); + + // assert!(result.is_ok()); + // let encoded_peptides = result.unwrap(); + // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1)); + } + + #[test] + fn test_encode_peptides_batch() { + + let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); + let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); + let device = Device::Cpu; + + let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device.clone()).unwrap(); + + // Batched input + let peptide_sequences = vec![ + "ACDEFGHIK".to_string(), + "AGHCEWQMKYR".to_string(), + ]; + let mods = vec![ + "Carbamidomethyl@C".to_string(), + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), + ]; + let mod_sites = vec![ + "1".to_string(), + "0;4;8".to_string(), + ]; + + println!("Peptides: {:?}", peptide_sequences); + println!("Mods: {:?}", mods); + println!("Mod sites: {:?}", mod_sites); + + + let result = model.encode_peptides( + &peptide_sequences, + &mods, + &mod_sites, + None, + None, + None, + ); + + assert!(result.is_ok()); + let tensor = result.unwrap(); + println!("Batched encoded tensor shape: {:?}", tensor.shape()); + + let (batch, seq_len, feat_dim) = tensor.shape().dims3().unwrap(); + assert_eq!(batch, 2); // two peptides + assert!(seq_len >= 11); // 
padded to max length + assert!(feat_dim > 1); // includes aa + mod features + } + + + + +} diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs index d6cc501..dd9bcab 100644 --- a/crates/redeem-properties/src/models/rt_model.rs +++ b/crates/redeem-properties/src/models/rt_model.rs @@ -12,11 +12,11 @@ use crate::utils::peptdeep_utils::ModificationMap; // Enum for different types of retention time models pub enum RTModelArch { RTCNNLSTM, - // Add other architectures here as needed + RTCNNTF } // Constants for different types of retention time models -pub const RTMODEL_ARCHS: &[&str] = &["rt_cnn_lstm"]; +pub const RTMODEL_ARCHS: &[&str] = &["rt_cnn_lstm", "rt_cnn_tf"]; // A wrapper struct for RT models pub struct RTModelWrapper { diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs index 2b2d6f1..8b274b5 100644 --- a/crates/redeem-properties/src/utils/peptdeep_utils.rs +++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs @@ -113,6 +113,36 @@ struct ModFeature { // Add other fields if needed } +impl Default for ModelConstants { + fn default() -> Self { + Self { + aa_embedding_size: Some(27), + charge_factor: Some(0.1), + instruments: vec![ + "QE".into(), + "Lumos".into(), + "timsTOF".into(), + "SciexTOF".into(), + ], + max_instrument_num: 8, + mod_elements: vec![ + "C", "H", "N", "O", "P", "S", "B", "F", "I", "K", "U", "V", "W", "X", "Y", "Ac", + "Ag", "Al", "Am", "Ar", "As", "At", "Au", "Ba", "Be", "Bi", "Bk", "Br", "Ca", "Cd", + "Ce", "Cf", "Cl", "Cm", "Co", "Cr", "Cs", "Cu", "Dy", "Er", "Es", "Eu", "Fe", "Fm", + "Fr", "Ga", "Gd", "Ge", "He", "Hf", "Hg", "Ho", "In", "Ir", "Kr", "La", "Li", "Lr", + "Lu", "Md", "Mg", "Mn", "Mo", "Na", "Nb", "Nd", "Ne", "Ni", "No", "Np", "Os", "Pa", + "Pb", "Pd", "Pm", "Po", "Pr", "Pt", "Pu", "Ra", "Rb", "Re", "Rh", "Rn", "Ru", "Sb", + "Sc", "Se", "Si", "Sm", "Sn", "Sr", "Ta", "Tb", "Tc", "Te", "Th", "Ti", "Tl", "Tm", + "Xe", "Yb", "Zn", "Zr", "2H", "13C", "15N", "18O", "?" + ] + .into_iter() + .map(String::from) + .collect(), + nce_factor: Some(0.01), + } + } +} + /// Parse the model constants from a YAML file. 
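// The Default impl above mirrors the AlphaPeptDeep generic constants, so an
// untrained model can be constructed without a YAML file; a small sketch:
//
//     let constants = ModelConstants::default();
//     assert_eq!(constants.charge_factor, Some(0.1));
//     assert_eq!(constants.max_instrument_num, 8);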
pub fn parse_model_constants(path: &str) -> Result { let f = std::fs::File::open(path).map_err(|e| Error::msg(e.to_string()))?; diff --git a/crates/redeem-properties/src/utils/utils.rs b/crates/redeem-properties/src/utils/utils.rs index e1c105e..34572a6 100644 --- a/crates/redeem-properties/src/utils/utils.rs +++ b/crates/redeem-properties/src/utils/utils.rs @@ -1,6 +1,64 @@ use candle_core::Device; use candle_core::utils::{cuda_is_available, metal_is_available}; use anyhow::{Result, anyhow}; +use std::f64::consts::PI; + +pub trait LRScheduler { + /// Update the learning rate based on the current step + fn step(&mut self); + + /// Get the current learning rate + fn get_last_lr(&self) -> f64; +} + +pub struct CosineWithWarmup { + initial_lr: f64, + current_step: usize, + num_warmup_steps: usize, + num_training_steps: usize, + num_cycles: f64, +} + +impl CosineWithWarmup { + pub fn new( + initial_lr: f64, + num_warmup_steps: usize, + num_training_steps: usize, + num_cycles: f64, + ) -> Self { + Self { + initial_lr, + current_step: 0, + num_warmup_steps: num_warmup_steps, + num_training_steps, + num_cycles, + } + } + + fn get_lr(&self) -> f64 { + if self.current_step < self.num_warmup_steps { + // Linear warmup + return self.initial_lr * (self.current_step as f64) / (self.num_warmup_steps as f64); + } + + let progress = (self.current_step - self.num_warmup_steps) as f64 + / (self.num_training_steps - self.num_warmup_steps) as f64; + + // Cosine decay + let cosine_decay = 0.5 * (1.0 + (PI * self.num_cycles * 2.0 * progress).cos()); + self.initial_lr * cosine_decay.max(1e-10) + } +} + +impl LRScheduler for CosineWithWarmup { + fn step(&mut self) { + self.current_step += 1; + } + + fn get_last_lr(&self) -> f64 { + self.get_lr() + } +} /// Converts a device string to a Candle Device. 
/// From 50c4a070b8f61a2189a70d1943cad6e69d0010fa Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 13:29:20 -0400 Subject: [PATCH 17/75] feat: Add new modules for training and loading data in redeem-cli --- Cargo.toml | 2 +- crates/redeem-cli/Cargo.toml | 27 ++++ crates/redeem-cli/src/lib.rs | 1 + crates/redeem-cli/src/main.rs | 138 ++++++++++++++++++ crates/redeem-cli/src/properties/load_data.rs | 69 +++++++++ crates/redeem-cli/src/properties/mod.rs | 2 + .../redeem-cli/src/properties/train/input.rs | 88 +++++++++++ crates/redeem-cli/src/properties/train/mod.rs | 2 + .../src/properties/train/trainer.rs | 60 ++++++++ 9 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 crates/redeem-cli/Cargo.toml create mode 100644 crates/redeem-cli/src/lib.rs create mode 100644 crates/redeem-cli/src/main.rs create mode 100644 crates/redeem-cli/src/properties/load_data.rs create mode 100644 crates/redeem-cli/src/properties/mod.rs create mode 100644 crates/redeem-cli/src/properties/train/input.rs create mode 100644 crates/redeem-cli/src/properties/train/mod.rs create mode 100644 crates/redeem-cli/src/properties/train/trainer.rs diff --git a/Cargo.toml b/Cargo.toml index c196489..8539475 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = [ "crates/redeem-classifiers", +members = [ "crates/redeem-classifiers", "crates/redeem-cli", "crates/redeem-properties" ] diff --git a/crates/redeem-cli/Cargo.toml b/crates/redeem-cli/Cargo.toml new file mode 100644 index 0000000..f75a161 --- /dev/null +++ b/crates/redeem-cli/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "redeem-cli" +version = "0.1.0" +edition = "2024" + +[[bin]] +name = "redeem" +path = "src/main.rs" + +[dependencies] +redeem-properties = { path = "../redeem-properties" } +env_logger = "0.11.8" +log = "0.4" +clap = { version="4.0", features = ["cargo", "unicode"] } +anyhow = "1.0" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +csv = "1.1" + +[dependencies.candle-core] +version = "0.8.4" +default-features = false +features = [] + +[features] +default = [] +cuda = ["candle-core/cuda"] \ No newline at end of file diff --git a/crates/redeem-cli/src/lib.rs b/crates/redeem-cli/src/lib.rs new file mode 100644 index 0000000..76ebb50 --- /dev/null +++ b/crates/redeem-cli/src/lib.rs @@ -0,0 +1 @@ +pub mod properties; \ No newline at end of file diff --git a/crates/redeem-cli/src/main.rs b/crates/redeem-cli/src/main.rs new file mode 100644 index 0000000..a6f8874 --- /dev/null +++ b/crates/redeem-cli/src/main.rs @@ -0,0 +1,138 @@ +use clap::{Arg, Command, ArgMatches, ValueHint}; +use log::LevelFilter; +use std::path::PathBuf; +use anyhow::Result; + +use redeem_cli::properties::train::input::{self, PropertyTrainConfig}; +use redeem_cli::properties::train::trainer; + +fn main() -> Result<()> { + env_logger::Builder::default() + .filter_level(LevelFilter::Error) + .parse_env(env_logger::Env::default().filter_or("REDEEM_LOG", "error,redeem=info")) + .init(); + + let matches = Command::new("redeem") + .version(clap::crate_version!()) + .author("Justin Sing ") + .about("\u{1F9EA} ReDeeM CLI - Modular Deep Learning Tools for Proteomics") + .subcommand_required(true) + .arg_required_else_help(true) + .subcommand( + Command::new("properties") + .about("Train or run peptide property prediction models") + .subcommand( + Command::new("train") + .about("Train a new property prediction model from scratch") + .arg( + Arg::new("config") + .help("Path to training configuration file") + .required(true) 
+ .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("train_data") + .short('d') + .long("train_data") + .value_parser(clap::builder::NonEmptyStringValueParser::new()) + .help( + "Path to training data. Overrides the training data file \ + specified in the configuration file.", + ) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("validation_data") + .short('v') + .long("validation_data") + .value_parser(clap::builder::NonEmptyStringValueParser::new()) + .help( + "Path to validation data. Overrides the validation data file \ + specified in the configuration file.", + ) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("output_file") + .short('o') + .long("output_file") + .value_parser(clap::builder::NonEmptyStringValueParser::new()) + .help( + "File path that the safetensors trained model will be written to. \ + Overrides the directory specified in the configuration file.", + ) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("model_arch") + .short('m') + .long("model_arch") + .help( + "Model architecture to train. \ + Overrides the model architecture specified in the configuration file.", + ) + .value_parser([ + "rt_cnn_lstm", + "rt_cnn_tf", + "ms2_bert", + "ccs_cnn_lstm", + ]) + .required(false) + ) + .help_template( + "{usage-heading} {usage}\n\n\ + {about-with-newline}\n\ + Written by {author-with-newline}Version {version}\n\n\ + {all-args}{after-help}", + ), + ), + ) + .subcommand( + Command::new("classifiers") + .about("Run classification tools such as rescoring") + .subcommand( + Command::new("rescore") + .about("Run rescoring tool with specified configuration") + .arg( + Arg::new("config") + .help("Path to classifier configuration file") + .required(true) + .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ), + ), + ) + .get_matches(); + + match matches.subcommand() { + Some(("properties", sub_m)) => handle_properties(sub_m), + Some(("classifiers", sub_m)) => handle_classifiers(sub_m), + _ => unreachable!("Subcommand is required by CLI configuration"), + } +} + +fn handle_properties(matches: &ArgMatches) -> Result<()> { + match matches.subcommand() { + Some(("train", train_matches)) => { + let config_path: &PathBuf = train_matches.get_one("config").unwrap(); + println!("[ReDeeM::Properties] Training from config: {:?}", config_path); + let params: PropertyTrainConfig = input::PropertyTrainConfig::from_arguments(config_path, train_matches)?; + let _ = trainer::run_training(¶ms); + Ok(()) + } + _ => unreachable!(), + } +} + +fn handle_classifiers(matches: &ArgMatches) -> Result<()> { + match matches.subcommand() { + Some(("rescore", rescore_matches)) => { + let config_path: &PathBuf = rescore_matches.get_one("config").unwrap(); + println!("[ReDeeM::Classifiers] Rescoring using config: {:?}", config_path); + // Call your classifier logic using config_path + Ok(()) + } + _ => unreachable!(), + } +} diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs new file mode 100644 index 0000000..7be1cfb --- /dev/null +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -0,0 +1,69 @@ +use std::fs::File; +use std::path::Path; +use std::io::BufReader; +use anyhow::{Result, Context}; +use csv::ReaderBuilder; +use redeem_properties::utils::data_handling::PeptideData; + +/// Load peptide training data from a CSV or TSV file. +/// +/// Automatically determines the delimiter and supports RT models. 
+/// Currently expects columns: "sequence", "retention time" (others optional). +/// +/// # Arguments +/// * `path` - Path to the input CSV/TSV file +/// +/// # Returns +/// Vector of parsed `PeptideData` records +pub fn load_peptide_data>(path: P) -> Result> { + let file = File::open(&path).with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?; + let reader = BufReader::new(file); + + let is_tsv = path.as_ref().extension().map(|e| e == "tsv").unwrap_or(false); + let delimiter = if is_tsv { b'\t' } else { b',' }; + + let mut rdr = ReaderBuilder::new() + .delimiter(delimiter) + .has_headers(true) + .from_reader(reader); + + let headers = rdr.headers()?.clone(); + + let mut peptides = Vec::new(); + for result in rdr.records() { + let record = result?; + + let sequence = record + .get(headers.iter().position(|h| h == "sequence").unwrap_or(2)) + .unwrap_or("") + .to_string(); + + let retention_time = record + .get(headers.iter().position(|h| h == "retention time").unwrap_or(3)) + .and_then(|s| s.parse::().ok()); + + let charge = record + .get(headers.iter().position(|h| h == "charge").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()); + + let nce = record + .get(headers.iter().position(|h| h == "nce").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()); + + let instrument = record + .get(headers.iter().position(|h| h == "instrument").unwrap_or(usize::MAX)) + .map(|s| s.to_string()); + + peptides.push(PeptideData::new( + &sequence, + charge, + nce, + instrument.as_deref(), + retention_time, + None, + None, + )); + } + + Ok(peptides) +} diff --git a/crates/redeem-cli/src/properties/mod.rs b/crates/redeem-cli/src/properties/mod.rs new file mode 100644 index 0000000..eb69af4 --- /dev/null +++ b/crates/redeem-cli/src/properties/mod.rs @@ -0,0 +1,2 @@ +pub mod train; +pub mod load_data; \ No newline at end of file diff --git a/crates/redeem-cli/src/properties/train/input.rs b/crates/redeem-cli/src/properties/train/input.rs new file mode 100644 index 0000000..0a9fe59 --- /dev/null +++ b/crates/redeem-cli/src/properties/train/input.rs @@ -0,0 +1,88 @@ +use serde::Deserialize; +use std::fs; +use std::path::PathBuf; +use clap::ArgMatches; +use anyhow::{Context, Result}; + +#[derive(Debug, Deserialize, Clone)] +pub struct PropertyTrainConfig { + pub train_data: String, + pub validation_data: Option, + pub output_file: String, + pub model_arch: String, + pub device: String, + pub batch_size: usize, + pub learning_rate: f32, + pub epochs: usize, + pub instrument: String, + pub nce: i32, +} + +impl Default for PropertyTrainConfig { + fn default() -> Self { + PropertyTrainConfig { + train_data: String::new(), + validation_data: None, + output_file: String::from("rt_cnn_tf.safetensors"), + model_arch: String::from("rt_cnn_tf"), + device: String::from("cpu"), + batch_size: 64, + learning_rate: 1e-3, + epochs: 10, + instrument: String::from("QE"), + nce: 20, + } + } +} + +impl PropertyTrainConfig { + pub fn from_arguments(config_path: &PathBuf, matches: &ArgMatches) -> Result { + let config_json = fs::read_to_string(config_path) + .with_context(|| format!("Failed to read config file: {:?}", config_path))?; + + let mut config: PropertyTrainConfig = serde_json::from_str(&config_json) + .unwrap_or_else(|_| PropertyTrainConfig::default()); + + // Apply CLI overrides + if let Some(train_data) = matches.get_one::("train_data") { + validate_tsv_or_csv_file(train_data)?; + config.train_data = train_data.clone().to_string(); + } else { + validate_tsv_or_csv_file(&config.train_data)?; + } + + 
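        // For reference, a config file consumed by this function might look like the
        // following (illustrative values; keys mirror the PropertyTrainConfig fields):
        //
        //     {
        //       "train_data": "train.csv",
        //       "validation_data": "val.csv",
        //       "output_file": "rt_cnn_tf.safetensors",
        //       "model_arch": "rt_cnn_tf",
        //       "device": "cpu",
        //       "batch_size": 64,
        //       "learning_rate": 0.001,
        //       "epochs": 10,
        //       "instrument": "QE",
        //       "nce": 20
        //     }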
+        if let Some(validation_data) = matches.get_one::<String>("validation_data") {
+            validate_tsv_or_csv_file(validation_data)?;
+            config.validation_data = Some(validation_data.clone());
+        } else if let Some(val_data) = &config.validation_data {
+            validate_tsv_or_csv_file(val_data)?;
+        }
+
+        if let Some(output_file) = matches.get_one::<String>("output_file") {
+            config.output_file = output_file.clone();
+        }
+
+        if let Some(model_arch) = matches.get_one::<String>("model_arch") {
+            config.model_arch = model_arch.clone();
+        }
+
+        Ok(config)
+    }
+}
+
+
+pub fn validate_tsv_or_csv_file(path: &str) -> Result<()> {
+    let pb = PathBuf::from(path);
+
+    let ext = pb.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase());
+    match ext.as_deref() {
+        Some("tsv") | Some("csv") => {}
+        _ => anyhow::bail!("File must have a .tsv or .csv extension: {}", path),
+    }
+
+    if !pb.exists() {
+        anyhow::bail!("File does not exist: {}", path);
+    }
+
+    Ok(())
+}
diff --git a/crates/redeem-cli/src/properties/train/mod.rs b/crates/redeem-cli/src/properties/train/mod.rs
new file mode 100644
index 0000000..d60a05a
--- /dev/null
+++ b/crates/redeem-cli/src/properties/train/mod.rs
@@ -0,0 +1,2 @@
+pub mod input;
+pub mod trainer;
\ No newline at end of file
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
new file mode 100644
index 0000000..ebd5f8b
--- /dev/null
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -0,0 +1,60 @@
+use anyhow::{Context, Result};
+use input::PropertyTrainConfig;
+use load_data::load_peptide_data;
+use redeem_properties::models::model_interface::ModelInterface;
+use redeem_properties::models::{rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel};
+use redeem_properties::utils::data_handling::PeptideData;
+use redeem_properties::utils::peptdeep_utils::load_modifications;
+use redeem_properties::utils::utils::get_device;
+use std::path::PathBuf;
+use candle_core::Device;
+
+use crate::properties::load_data;
+
+use super::input;
+
+pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
+
+    // Load training data
+    let train_peptides: Vec<PeptideData> = load_peptide_data(&config.train_data)?;
+    println!("Loaded {} training peptides", train_peptides.len());
+
+    // Load validation data if specified
+    let val_peptides = if let Some(ref val_path) = config.validation_data {
+        Some(load_peptide_data(val_path).context("Failed to load validation data")?)
+    } else {
+        None
+    };
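Since `load_peptide_data` resolves columns by header name, a minimal tab-separated training file accepted by this loader could look like the following (values illustrative; the "charge", "nce", and "instrument" columns are optional):

    sequence        retention time  charge  nce  instrument
    AGHCEWQMKYR     12.34           2       20   QE
    LESLIEK         45.67           3       25   QE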
+
+    if let Some(ref val_data) = val_peptides {
+        println!("Loaded {} validation peptides", val_data.len());
+    } else {
+        println!("No validation data provided.");
+    }
+
+    // Dispatch model training based on architecture
+    let model_arch = config.model_arch.as_str();
+    let device = get_device(&config.device)?;
+
+    let mut model: Box<dyn ModelInterface> = match model_arch {
+        "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new_untrained(device.clone())?),
+        "rt_cnn_tf" => Box::new(RTCNNTFModel::new_untrained(device.clone())?),
+        _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", model_arch)),
+    };
+
+    let modifications = load_modifications().context("Failed to load modifications")?;
+
+    model.train(
+        &train_peptides,
+        val_peptides.as_ref(),
+        modifications,
+        config.batch_size,
+        config.learning_rate as f64,
+        config.epochs,
+    )?;
+
+    model.save(&config.output_file)?;
+    println!("Model saved to: {}", config.output_file);
+
+    Ok(())
+}
From b5decf0e7db7752e6a233fabea134059b29126aa Mon Sep 17 00:00:00 2001
From: singjc
Date: Fri, 9 May 2025 14:41:35 -0400
Subject: [PATCH 18/75] refactor: Add early stopping to property training

---
 .../redeem-cli/src/properties/train/input.rs  |   2 +
 .../src/properties/train/trainer.rs           |  11 +-
 .../src/models/model_interface.rs             | 115 +++++++++++-------
 3 files changed, 80 insertions(+), 48 deletions(-)

diff --git a/crates/redeem-cli/src/properties/train/input.rs b/crates/redeem-cli/src/properties/train/input.rs
index 0a9fe59..fb5d3f3 100644
--- a/crates/redeem-cli/src/properties/train/input.rs
+++ b/crates/redeem-cli/src/properties/train/input.rs
@@ -14,6 +14,7 @@ pub struct PropertyTrainConfig {
     pub batch_size: usize,
     pub learning_rate: f32,
     pub epochs: usize,
+    pub early_stopping_patience: usize,
     pub instrument: String,
     pub nce: i32,
 }
@@ -29,6 +30,7 @@ impl Default for PropertyTrainConfig {
             batch_size: 64,
             learning_rate: 1e-3,
             epochs: 10,
+            early_stopping_patience: 5,
             instrument: String::from("QE"),
             nce: 20,
         }
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index ebd5f8b..2d23994 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -17,7 +17,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
 
     // Load training data
     let train_peptides: Vec<PeptideData> = load_peptide_data(&config.train_data)?;
-    println!("Loaded {} training peptides", train_peptides.len());
+    log::info!("Loaded {} training peptides", train_peptides.len());
 
     // Load validation data if specified
     let val_peptides = if let Some(ref val_path) = config.validation_data {
@@ -27,9 +27,9 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
     };
 
     if let Some(ref val_data) = val_peptides {
-        println!("Loaded {} validation peptides", val_data.len());
+        log::info!("Loaded {} validation peptides", val_data.len());
     } else {
-        println!("No validation data provided.");
+        log::warn!("No validation data provided.");
     }
 
     // Dispatch model training based on architecture
@@ -44,6 +44,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
 
     let modifications = load_modifications().context("Failed to load modifications")?;
 
+    let start_time = std::time::Instant::now();
     model.train(
         &train_peptides,
         val_peptides.as_ref(),
@@ -51,10 +52,12 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         config.batch_size,
         config.learning_rate as f64,
         config.epochs,
+        config.early_stopping_patience,
     )?;
+    log::info!("Training completed in
{:?}", start_time.elapsed()); model.save(&config.output_file)?; - println!("Model saved to: {}", config.output_file); + log::info!("Model saved to: {}", config.output_file); Ok(()) } diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 8e614ef..1817681 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -416,6 +416,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { batch_size: usize, learning_rate: f64, epochs: usize, + early_stopping_patience: usize, ) -> Result<()> { let num_batches = (training_data.len() + batch_size - 1) / batch_size; @@ -433,6 +434,9 @@ pub trait ModelInterface: Send + Sync + ModelClone { }; let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; + let mut best_val_loss = f32::INFINITY; + let mut epochs_without_improvement = 0; + for epoch in 0..epochs { let progress = Progress::new(num_batches, &format!("[training] Epoch {}: ", epoch)); let mut total_loss = 0.0; @@ -488,54 +492,77 @@ pub trait ModelInterface: Send + Sync + ModelClone { // Optional validation evaluation if let Some(val_data) = validation_data { - let peptides: Vec = val_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let mods: Vec = val_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); - let mod_sites: Vec = val_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - let charges = val_data.iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == val_data.len() { Some(charges) } else { None }; - - let nces = val_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == val_data.len() { Some(nces) } else { None }; - - let instruments = val_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == val_data.len() { Some(instruments) } else { None }; - - let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; + let val_batches = (val_data.len() + batch_size - 1) / batch_size; + use rayon::prelude::*; + + let total_val_loss: f32 = val_data + .par_chunks(batch_size) + .map(|batch_data| { + let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); + let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); + + let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); + let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; + + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); + let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; + + let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); + let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; + + let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments); + let input_val = match input_val { + Ok(x) => x, + Err(e) => return Err(e), + }; + + let val_targets = match self.property_type() { + PropertyType::RT => PredictionResult::RTResult( + batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), + ), + PropertyType::CCS => PredictionResult::IMResult( + batch_data.iter().map(|p| 
p.ion_mobility.unwrap_or_default()).collect(), + ), + PropertyType::MS2 => { + return Err(anyhow::anyhow!("Validation not supported for MS2 yet")); + } + }; + + let target_val = match val_targets { + PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { + Tensor::new(values.clone(), &self.get_device())? + } + PredictionResult::MS2Result(_) => unreachable!(), + }; + + let predicted = self.forward(&input_val)?; + let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; + Ok(val_loss.to_vec0::()?) + }) + .collect::>>()? + .into_iter() + .sum(); + + let avg_val_loss = total_val_loss / val_batches as f32; + let avg_loss = total_loss / num_batches as f32; - let val_targets = match self.property_type() { - PropertyType::RT => PredictionResult::RTResult( - val_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), - ), - PropertyType::CCS => PredictionResult::IMResult( - val_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), - ), - PropertyType::MS2 => { - return Err(anyhow::anyhow!("Validation not supported for MS2 yet")); - } - }; + progress.update_description(&format!("Epoch {}: Avg. Train Loss: {:.4} | Avg. Val. Loss: {:.4}", epoch, avg_loss, avg_val_loss)); + progress.finish(); - let target_val = match val_targets { - PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { - Tensor::new(values.clone(), &self.get_device())? + if avg_val_loss < best_val_loss { + best_val_loss = avg_val_loss; + epochs_without_improvement = 0; + } else { + epochs_without_improvement += 1; + if epochs_without_improvement >= early_stopping_patience { + info!("Early stopping triggered after {} epochs without validation loss improvement.", early_stopping_patience); + break; } - PredictionResult::MS2Result(_) => unreachable!(), - }; - - let predicted = self.forward(&input_val)?; - let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; - let val_loss_val = val_loss.to_vec0::()?; - - info!("[validation] Epoch {}: Validation Loss: {:.4}", epoch, val_loss_val); - - let avg_loss = total_loss / num_batches as f32; - progress.update_description(&format!("[training] Epoch {}: Avg. Loss: {:.4} | Val. Loss: {:.4}", epoch, avg_loss, val_loss_val)); - progress.finish(); - } - else - { + } + } else { let avg_loss = total_loss / num_batches as f32; - progress.update_description(&format!("[training] Epoch {}: Avg. Loss: {:.4}", epoch, avg_loss)); + progress.update_description(&format!("Epoch {}: Avg. 
Train Loss: {:.4}", epoch, avg_loss)); progress.finish(); } } From 37081bafe7582cdc0c6109652d9f9b78b78500c8 Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 14:51:19 -0400 Subject: [PATCH 19/75] feat: Add Dockerfile for CUDA-based application containerization --- Dockerfile | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..016d191 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,49 @@ +# Use the official NVIDIA CUDA base image with CUDA 12.2 +FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + curl \ + libssl-dev \ + pkg-config \ + clang \ + libstdc++-12-dev \ + cmake \ + git \ + && \ + update-ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# Install Rust using rustup +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y +ENV PATH="/root/.cargo/bin:${PATH}" + +# Set environment variables for CUDA +ENV CUDA_HOME=/usr/local/cuda +ENV PATH=${CUDA_HOME}/bin:${PATH} +ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} + +# Set the CUDA compute capability for the build process +# Tesla V100 has compute capability 7.0 +ENV CUDA_COMPUTE_CAP=70 + +# Set the working directory +WORKDIR /app + +# Copy the source code into the container +COPY . . + +# Update specific dependencies (if needed) +RUN cargo update -p redeem-classifiers + +# Build the application with CUDA support +RUN cargo build --release --bin redeem --features cuda + +# Copy the binary into the PATH +RUN cp target/release/redeem /app/redeem + +# Set the PATH environment variable +ENV PATH="/app:${PATH}" \ No newline at end of file From 1ceb7e69b485af720840570577c313859593c52a Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 14:54:56 -0400 Subject: [PATCH 20/75] refactor: Remove unnecessary cargo update command in Dockerfile --- Dockerfile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 016d191..4679edb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -36,9 +36,6 @@ WORKDIR /app # Copy the source code into the container COPY . . 
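For orientation, the image produced by this Dockerfile is typically built and exercised as follows; the tag name is illustrative, and `--gpus all` requires the NVIDIA Container Toolkit on the host:

    docker build -t redeem-cuda .
    docker run --rm --gpus all redeem-cuda redeem --help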
-# Update specific dependencies (if needed) -RUN cargo update -p redeem-classifiers - # Build the application with CUDA support RUN cargo build --release --bin redeem --features cuda From 480d6c3b253ae9db0172d66222f99f78353d5977 Mon Sep 17 00:00:00 2001 From: singjc Date: Fri, 9 May 2025 15:00:57 -0400 Subject: [PATCH 21/75] refactor: Update dependencies and descriptions in Cargo.toml files --- crates/redeem-classifiers/Cargo.toml | 2 +- crates/redeem-cli/Cargo.toml | 2 ++ crates/redeem-properties/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/redeem-classifiers/Cargo.toml b/crates/redeem-classifiers/Cargo.toml index f21af27..3660bf5 100644 --- a/crates/redeem-classifiers/Cargo.toml +++ b/crates/redeem-classifiers/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" rust-version = "1.76" description = "A repository of deep-learning models for mass spectrometry data" readme = "README.md" -license = "MIT" + [dependencies] anyhow = "1.0" diff --git a/crates/redeem-cli/Cargo.toml b/crates/redeem-cli/Cargo.toml index f75a161..8e2fbd7 100644 --- a/crates/redeem-cli/Cargo.toml +++ b/crates/redeem-cli/Cargo.toml @@ -2,6 +2,8 @@ name = "redeem-cli" version = "0.1.0" edition = "2024" +authors = ["Justin Sing "] +description = "A repository of deep-learning models for mass spectrometry data" [[bin]] name = "redeem" diff --git a/crates/redeem-properties/Cargo.toml b/crates/redeem-properties/Cargo.toml index e39b7c5..56ef10b 100644 --- a/crates/redeem-properties/Cargo.toml +++ b/crates/redeem-properties/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" rust-version = "1.76" description = "A repository of deep-learning models for mass spectrometry data" readme = "README.md" -license = "MIT" + [dependencies] anyhow = "1.0" From 4b7f92c580bbe485c848a25e3139cd837f09cd90 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 21:44:03 -0400 Subject: [PATCH 22/75] refactor: Update redeem-properties crate models and add new modules for training and loading data in redeem-cli --- .../examples/alphapeptdeep_ccs_cnn_lstm.rs | 2 +- .../examples/alphapeptdeep_ms2_bert.rs | 2 +- .../examples/alphapeptdeep_rt_cnn_lstm.rs | 2 +- .../src/building_blocks/bilstm.rs | 10 +- .../src/building_blocks/building_blocks.rs | 80 ++++-- .../src/building_blocks/nn.rs | 93 ++++--- .../src/models/ccs_cnn_lstm_model.rs | 14 +- .../redeem-properties/src/models/ccs_model.rs | 2 +- .../src/models/model_interface.rs | 244 +++++++++++++----- .../src/models/ms2_bert_model.rs | 14 +- .../redeem-properties/src/models/ms2_model.rs | 2 +- .../src/models/rt_cnn_lstm_model.rs | 25 +- .../src/models/rt_cnn_transformer_model.rs | 48 +++- .../redeem-properties/src/models/rt_model.rs | 17 +- .../src/utils/data_handling.rs | 2 +- crates/redeem-properties/src/utils/logging.rs | 2 +- .../src/utils/peptdeep_utils.rs | 1 + crates/redeem-properties/src/utils/utils.rs | 25 +- 18 files changed, 421 insertions(+), 164 deletions(-) diff --git a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs index 874a82b..63973da 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs @@ -91,7 +91,7 @@ fn main() -> Result<()> { println!("Device: {:?}", device); - let mut model = CCSCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device) + let mut model = CCSCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) .context("Failed to create 
CCSCNNLSTMModel")?; // Define training data diff --git a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs index 9e177de..b3ee4b1 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs @@ -139,7 +139,7 @@ fn main() -> Result<()> { println!("Device: {:?}", device); - let mut model = MS2BertModel::new(&model_path, &constants_path, 0, 8, 4, true, device) + let mut model = MS2BertModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) .context("Failed to create MS2BertModel")?; // Open the CSV file diff --git a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs index aeeb6f3..7408473 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs @@ -113,7 +113,7 @@ fn main() -> Result<()> { println!("Device: {:?}", device); - let mut model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device) + let mut model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) .context("Failed to create RTCNNLSTMModel")?; // Define training data diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index b6dd1aa..68d5204 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -67,7 +67,7 @@ impl BidirectionalLSTM { )?; let last_fw_h = out_fw_states.last().unwrap().h().clone(); let last_fw_c = out_fw_states.last().unwrap().c().clone(); - println!("BidirectionLSTM::apply_bidirectional_layer - Forward LSTM time: {:?}", start_time.elapsed()); + log::trace!("BidirectionLSTM::apply_bidirectional_layer - Forward LSTM time: {:?}", start_time.elapsed()); // Reverse sequence let start_time = std::time::Instant::now(); @@ -78,7 +78,7 @@ impl BidirectionalLSTM { .collect::>>()?, 1, )?; - println!("BidirectionLSTM::apply_bidirectional_layer - Reverse sequence time: {:?}", start_time.elapsed()); + log::trace!("BidirectionLSTM::apply_bidirectional_layer - Reverse sequence time: {:?}", start_time.elapsed()); // Initial states for backward let h0_backward = h0.i(1)?; @@ -93,7 +93,7 @@ impl BidirectionalLSTM { )?; let last_bw_h = out_bw_states.last().unwrap().h().clone(); let last_bw_c = out_bw_states.last().unwrap().c().clone(); - println!("BidirectionLSTM::apply_bidirectional_layer - Backward LSTM time: {:?}", start_time.elapsed()); + log::trace!("BidirectionLSTM::apply_bidirectional_layer - Backward LSTM time: {:?}", start_time.elapsed()); // Combine hidden and cell states let hn = Tensor::stack(&[last_fw_h.clone(), last_bw_h.clone()], 0)?; @@ -118,10 +118,10 @@ impl BidirectionalLSTM { let start_time = std::time::Instant::now(); let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?; - println!("BidirectionLSTM::forward_with_state - Layer 1 time: {:?}", start_time.elapsed()); + log::trace!("BidirectionLSTM::forward_with_state - Layer 1 time: {:?}", start_time.elapsed()); let start_time = std::time::Instant::now(); let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?; - println!("BidirectionLSTM::forward_with_state - Layer 2 time: {:?}", start_time.elapsed()); + 
log::trace!("BidirectionLSTM::forward_with_state - Layer 2 time: {:?}", start_time.elapsed()); let hn = Tensor::cat(&[hn1, hn2], 0)?; let cn = Tensor::cat(&[cn1, cn2], 0)?; diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index 2c3fc28..edd7250 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -11,6 +11,7 @@ use crate::building_blocks::bilstm::BidirectionalLSTM; use crate::building_blocks::featurize::aa_one_hot; use crate::building_blocks::nn::{BertEncoderModule, ModuleList}; use crate::building_blocks::sequential::{seq, Sequential}; +use crate::utils::utils::get_tensor_stats; use super::nn::TransformerEncoder; @@ -27,26 +28,22 @@ pub struct DecoderLinear { impl DecoderLinear { pub fn new(in_features: usize, out_features: usize, vb: &nn::VarBuilder) -> Result { - // First linear layer: in_features -> 64 - let weight1 = Tensor::zeros((64, in_features), DType::F32, vb.device())?; - let bias1 = Tensor::zeros(64, DType::F32, vb.device())?; - let linear1 = nn::Linear::new(weight1, Some(bias1)); - - // Activation + log::trace!("[DecoderLinear::new] Initializing linear1"); + let linear1 = nn::linear(in_features, 64, vb.pp("nn.0"))?; + log::trace!("[DecoderLinear::new] Initializing prelu"); let prelu = nn::PReLU::new(Tensor::zeros(64, DType::F32, vb.device())?, false); - - // Second linear layer: 64 -> out_features - let weight2 = Tensor::zeros((out_features, 64), DType::F32, vb.device())?; - let bias2 = Tensor::zeros(out_features, DType::F32, vb.device())?; - let linear2 = nn::Linear::new(weight2, Some(bias2)); - + log::trace!("[DecoderLinear::new] Initializing linear2"); + let linear2 = nn::linear(64, out_features, vb.pp("nn.2"))?; + log::trace!("[DecoderLinear::new] Initializing sequential"); let mut nn = seq(); nn = nn.add(linear1); nn = nn.add(prelu); nn = nn.add(linear2); - + Ok(Self { nn }) } + + pub fn from_varstore( varstore: &nn::VarBuilder, @@ -80,6 +77,12 @@ impl Module for DecoderLinear { match self.nn.forward(x) { Ok(output) => { log::trace!("[DecoderLinear] output shape: {:?}", output.shape()); + log::trace!( + "[DecoderLinear] output stats - min: {:.4}, max: {:.4}, mean: {:.4}", + output.min_all()?.to_vec0::()?, + output.max_all()?.to_vec0::()?, + output.mean_all()?.to_vec0::()?, + ); Ok(output) } Err(e) => { @@ -798,6 +801,15 @@ impl SeqTransformer { impl Module for SeqTransformer { fn forward(&self, x: &Tensor) -> Result { + // Add check to ensure input feature dim matches expected model dim + let (_b, _t, d) = x.dims3()?; + let model_dim = self.encoder.model_dim; + if d != model_dim { + return Err(candle_core::Error::Msg(format!( + "SeqTransformer received input with dim {} but expected {}", + d, model_dim + ))); + } self.encoder.forward_with_mask(x, None, self.training) } } @@ -902,23 +914,31 @@ impl Encoder26aaModCnnLstmAttnSum { let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - println!("Encoder26aaModCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let additional_tensors: Vec<&Tensor> = vec![&mod_x]; - println!("Encoder26aaModCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - additional_tensors 
forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - println!("Encoder26aaModCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; - println!("Encoder26aaModCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnLstmAttnSum] CNN output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.input_lstm.forward(&x)?; - println!("Encoder26aaModCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnLstmAttnSum] LSTM output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; - println!("Encoder26aaModCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnLstmAttnSum] AttentionSum output stats - min: {min}, max: {max}, mean: {mean}"); Ok(x) } } @@ -980,27 +1000,27 @@ impl Encoder26aaModChargeCnnLstmAttnSum { let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let charges_repeated = charges.unsqueeze(1)?.repeat(&[1, mod_x.dim(1)?, 1])?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - charges_repeated forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - charges_repeated forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let additional_tensors: Vec<&Tensor> = vec![&mod_x, &charges_repeated]; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); + 
log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.input_lstm.forward(&x)?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; - println!("Encoder26aaModChargeCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); Ok(x) } } @@ -1106,18 +1126,26 @@ impl Encoder26aaModCnnTransformerAttnSum { let x = aa_one_hot(aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_cnn forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnTransformerAttnSum] input_cnn output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.input_transformer.forward(&x)?; log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_transformer forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnTransformerAttnSum] input_transformer output stats - min: {min}, max: {max}, mean: {mean}"); let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - attn_sum forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModCnnTransformerAttnSum] attn_sum output stats - min: {min}, max: {max}, mean: {mean}"); Ok(x) } diff --git a/crates/redeem-properties/src/building_blocks/nn.rs b/crates/redeem-properties/src/building_blocks/nn.rs index cbab40e..a8343c7 100644 --- a/crates/redeem-properties/src/building_blocks/nn.rs +++ b/crates/redeem-properties/src/building_blocks/nn.rs @@ -1,10 +1,14 @@ use candle_core::{Device, IndexOp, Result, Tensor}; -use candle_nn::{Dropout, LayerNorm, Linear, Module, VarBuilder}; +use candle_nn::init::{FanInOut, NonLinearity, NormalOrUniform}; +use candle_nn::{Dropout, Init, LayerNorm, Linear, Module, VarBuilder}; use candle_transformers::models::bert::{BertEncoder, Config}; use candle_nn::ops::softmax; +use std::env::var; use std::ops::{Deref, DerefMut}; use std::sync::Arc; +use crate::utils::utils::get_tensor_stats; + #[derive(Clone)] pub struct ModuleList { modules: Vec>, @@ -78,6 +82,7 @@ pub struct TransformerEncoder { layers: Vec, pos_encoding: Tensor, dropout: Dropout, + pub model_dim: usize, } impl TransformerEncoder { @@ -105,28 +110,38 @@ impl TransformerEncoder { } let pos_encoding = create_sinusoidal_encoding(max_len, model_dim, device)?; let dropout = Dropout::new(dropout_prob); - Ok(Self { layers, pos_encoding, dropout }) + Ok(Self { layers, pos_encoding, dropout, model_dim }) } pub fn forward_with_mask(&self, x: &Tensor, 
padding_mask: Option<&Tensor>, training: bool) -> Result { log::trace!("[TransformerEncoder] input x shape: {:?}", x.shape()); - + let (mean, min, max) = get_tensor_stats(x)?; + log::debug!("[TransformerEncoder] input stats: mean={}, min={}, max={}", mean, min, max); let (b, t, _) = x.dims3()?; let pe = self.pos_encoding.i((..t, ..))? .unsqueeze(0)? .broadcast_as((b, t, self.pos_encoding.dim(1)?))?; log::trace!("[TransformerEncoder] positional encoding shape: {:?}", pe.shape()); + let (mean, min, max) = get_tensor_stats(&pe)?; + log::debug!("[TransformerEncoder] positional encoding stats: mean={}, min={}, max={}", mean, min, max); let mut out = x.broadcast_add(&pe)?; + let (mean, min, max) = get_tensor_stats(&out)?; + log::debug!("[TransformerEncoder] after positional encoding stats: mean={}, min={}, max={}", mean, min, max); + out = self.dropout.forward(&out, training)?; log::trace!("[TransformerEncoder] after dropout shape: {:?}", out.shape()); + let (mean, min, max) = get_tensor_stats(&out)?; + log::debug!("[TransformerEncoder] after dropout stats: mean={}, min={}, max={}", mean, min, max); for (i, layer) in self.layers.iter().enumerate() { log::trace!("[TransformerEncoder] applying layer {}", i); out = layer.forward(&out, padding_mask, training)?; log::trace!("[TransformerEncoder] output shape after layer {}: {:?}", i, out.shape()); + let (mean, min, max) = get_tensor_stats(&out)?; + log::debug!("[TransformerEncoder] output stats after layer {}: mean={}, min={}, max={}", i, mean, min, max); } Ok(out) } @@ -153,16 +168,14 @@ impl TransformerEncoderLayer { Ok(Self { self_attn: MultiHeadAttention::new(varbuilder, model_dim, model_dim, num_heads)?, ff: FeedForward::new(varbuilder, model_dim, ff_dim)?, - norm1: { - let weight = varbuilder.get((model_dim,), "norm1.weight")?; - let bias = varbuilder.get((model_dim,), "norm1.bias")?; - LayerNorm::new(weight, bias, 1e-5) - }, - norm2: { - let weight = varbuilder.get((model_dim,), "norm2.weight")?; - let bias = varbuilder.get((model_dim,), "norm2.bias")?; - LayerNorm::new(weight, bias, 1e-5) - }, + norm1: candle_nn::layer_norm( + model_dim, + candle_nn::LayerNormConfig::default(), + varbuilder.pp("norm1"))?, + norm2: candle_nn::layer_norm( + model_dim, + candle_nn::LayerNormConfig::default(), + varbuilder.pp("norm2"))?, dropout1: Dropout::new(dropout_prob), dropout2: Dropout::new(dropout_prob), }) @@ -171,10 +184,24 @@ impl TransformerEncoderLayer { pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>, training: bool) -> Result { log::trace!("[TransformerEncoderLayer] input x shape: {:?}", x.shape()); let attn = self.self_attn.forward(x, mask)?; - let x = self.norm1.forward(&x.broadcast_add(&self.dropout1.forward(&attn, training)?)?)?; + let (mean, min, max) = get_tensor_stats(&attn)?; + log::debug!("[TransformerEncoderLayer] attention stats: mean={}, min={}, max={}", mean, min, max); + let tmp = self.dropout1.forward(&attn, training)?; + let (mean, min, max) = get_tensor_stats(&tmp)?; + log::debug!("[TransformerEncoderLayer] attention after dropout stats: mean={}, min={}, max={}", mean, min, max); + let tmp2 = x.broadcast_add(&tmp)?; + let (mean, min, max) = get_tensor_stats(&tmp2)?; + log::debug!("[TransformerEncoderLayer] after residual connection stats: mean={}, min={}, max={}", mean, min, max); + let x = self.norm1.forward(&tmp2)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[TransformerEncoderLayer] after norm1 stats: mean={}, min={}, max={}", mean, min, max); let ff = self.ff.forward(&x)?; + let (mean, min, max) = 
get_tensor_stats(&ff)?; + log::debug!("[TransformerEncoderLayer] feedforward stats: mean={}, min={}, max={}", mean, min, max); let result = self.norm2.forward(&x.broadcast_add(&self.dropout2.forward(&ff, training)?)?)?; log::trace!("[TransformerEncoderLayer] output shape: {:?}", result.shape()); + let (mean, min, max) = get_tensor_stats(&result)?; + log::debug!("[TransformerEncoderLayer] output stats: mean={}, min={}, max={}", mean, min, max); Ok(result) } } @@ -199,10 +226,10 @@ impl MultiHeadAttention { ) -> Result { let head_dim = model_dim / num_heads; Ok(Self { - proj_q: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_q")?, - proj_k: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_k")?, - proj_v: linear_from_varbuilder(varbuilder, input_dim, model_dim, "proj_v")?, - proj_out: linear_from_varbuilder(varbuilder, model_dim, model_dim, "proj_out")?, + proj_q: candle_nn::linear(input_dim, model_dim, varbuilder.pp("proj_q"))?, + proj_k: candle_nn::linear(input_dim, model_dim, varbuilder.pp("proj_k"))?, + proj_v: candle_nn::linear(input_dim, model_dim, varbuilder.pp("proj_v"))?, + proj_out: candle_nn::linear(model_dim, model_dim, varbuilder.pp("proj_out"))?, num_heads, head_dim, }) @@ -216,16 +243,25 @@ impl MultiHeadAttention { .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? .contiguous()?; + log::trace!("[MultiHeadAttention] Q shape after projection and transpose: {:?}", q.shape()); + let (mean, min, max) = get_tensor_stats(&q)?; + log::debug!("[MultiHeadAttention] Q stats: mean={}, min={}, max={}", mean, min, max); let k = self.proj_k.forward(x)? .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? .contiguous()?; + log::trace!("[MultiHeadAttention] K shape after projection and transpose: {:?}", k.shape()); + let (mean, min, max) = get_tensor_stats(&k)?; + log::debug!("[MultiHeadAttention] K stats: mean={}, min={}, max={}", mean, min, max); let v = self.proj_v.forward(x)? .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? 
.contiguous()?; + log::trace!("[MultiHeadAttention] V shape after projection and transpose: {:?}", v.shape()); + let (mean, min, max) = get_tensor_stats(&v)?; + log::debug!("[MultiHeadAttention] V stats: mean={}, min={}, max={}", mean, min, max); log::trace!("[MultiHeadAttention] Q/K/V shape after projection and transpose: {:?}", q.shape()); @@ -242,6 +278,8 @@ impl MultiHeadAttention { }; log::trace!("[MultiHeadAttention] Attention score shape: {:?}", scores.shape()); + let (mean, min, max) = get_tensor_stats(&scores)?; + log::debug!("[MultiHeadAttention] Attention score stats: mean={}, min={}, max={}", mean, min, max); if let Some(mask) = mask { log::trace!("[MultiHeadAttention] Applying mask"); @@ -263,6 +301,8 @@ impl MultiHeadAttention { return Err(e.into()); } }; + let (attn_mean, attn_min, attn_max) = get_tensor_stats(&attn)?; + log::debug!("[MultiHeadAttention] Attention stats: mean={}, min={}, max={}", attn_mean, attn_min, attn_max); let context = match attn.matmul(&v) { Ok(ctx) => ctx.transpose(1, 2)?.reshape((b, t, self.num_heads * self.head_dim))?, @@ -273,6 +313,8 @@ impl MultiHeadAttention { }; log::trace!("[MultiHeadAttention] Final context shape: {:?}", context.shape()); + let (mean, min, max) = get_tensor_stats(&context)?; + log::debug!("[MultiHeadAttention] Context stats: mean={}, min={}, max={}", mean, min, max); self.proj_out.forward(&context) } } @@ -286,8 +328,8 @@ pub struct FeedForward { impl FeedForward { pub fn new(varbuilder: &VarBuilder, model_dim: usize, ff_dim: usize) -> Result { Ok(Self { - lin1: linear_from_varbuilder(varbuilder, model_dim, ff_dim, "lin1")?, - lin2: linear_from_varbuilder(varbuilder, ff_dim, model_dim, "lin2")?, + lin1: candle_nn::linear(model_dim, ff_dim, varbuilder.pp("lin1"))?, + lin2: candle_nn::linear(ff_dim, model_dim, varbuilder.pp("lin2"))?, }) } @@ -297,17 +339,6 @@ impl FeedForward { } } -fn linear_from_varbuilder( - vb: &VarBuilder, - in_dim: usize, - out_dim: usize, - prefix: &str, -) -> Result { - let weight = vb.get((out_dim, in_dim), &format!("{}.weight", prefix))?; - let bias = vb.get((out_dim,), &format!("{}.bias", prefix)).ok(); - Ok(Linear::new(weight, bias)) -} - /// Generate sinusoidal positional encoding like in "Attention is All You Need". pub fn create_sinusoidal_encoding(seq_len: usize, model_dim: usize, device: &Device) -> Result { let mut pe = vec![0f32; seq_len * model_dim]; diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs index 463c126..fa8489a 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs @@ -66,7 +66,7 @@ impl ModelInterface for CCSCNNLSTMModel { /// Create a new CCSCNNLSTMModel instance model from the given model and constants files. fn new>( model_path: P, - constants_path: P, + constants_path: Option
<P>,
         fixed_sequence_len: usize,
         num_frag_types: usize,
         num_modloss_types: usize,
@@ -80,8 +80,10 @@
 
         let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);
 
-        let constants: ModelConstants =
-            parse_model_constants(constants_path.as_ref().to_str().unwrap())?;
+        let constants = match constants_path {
+            Some(path) => parse_model_constants(path.as_ref().to_str().unwrap())?,
+            None => ModelConstants::default(),
+        };
 
         // Load the mod_to_feature mapping
         let mod_to_feature = load_mod_to_feature(&constants)?;
@@ -295,7 +297,7 @@ mod tests {
         let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml");
         let device = Device::Cpu;
 
-        let model = CCSCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap();
+        let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap();
 
         println!("{:?}", model);
     }
@@ -306,7 +308,7 @@
         let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml");
         let device = Device::Cpu;
 
-        let model = CCSCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap();
+        let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap();
 
         let peptide_sequences = "AGHCEWQMKYR";
         let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M";
@@ -331,7 +333,7 @@
         let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml");
         let device = Device::Cpu;
 
-        let model = CCSCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap();
+        let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap();
 
         let peptide_sequences = vec!["AGHCEWQMKYR".to_string(), "AGHCEWQMKYR".to_string()];
         let mods = vec!["Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string()];
diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 4921bb4..c6b719a 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -32,7 +32,7 @@ impl Clone for CCSModelWrapper {
 impl CCSModelWrapper {
     pub fn new<P: AsRef<Path>>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result<Self> {
         let model: Box<dyn ModelInterface> = match arch {
-            "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device)?),
+            "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?),
             // Add other cases here as you implement more models
             _ => return Err(anyhow!("Unsupported CCS model architecture: {}", arch)),
         };
diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 1817681..120dd48 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -15,7 +15,7 @@ use candle_core::{DType, Device, Tensor, Var};
 use candle_nn::{Optimizer, VarMap};
 use log::info;
 use rayon::prelude::*;
-use std::ops::Index;
+use std::ops::{Deref, Index};
 use std::path::Path;
 use std::{collections::HashMap, path::PathBuf};
 
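With `constants_path` now an `Option`, a pretrained model can be loaded without a constants YAML and fall back to `ModelConstants::default()`; a minimal sketch against the CCS model updated above (the path is illustrative):

    // Assumes: use std::path::PathBuf; use candle_core::Device;
    // and the CCSCNNLSTMModel type from redeem-properties.
    let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth");
    let model = CCSCNNLSTMModel::new(&model_path, None, 0, 8, 4, true, Device::Cpu)?;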
@@ -210,9 +210,14 @@
     /// Create a new instance of the model (given a pretrained model (.pth or .safetensors) and a constants file).
+    ///
+    /// # Arguments
+    /// * `model_path` - Path to the model file (.pth or .safetensors).
+    /// * `constants_path` - Optional path to the model constants file (.yaml). If none, will use the default constants.
+    ///
     fn new<P: AsRef<Path>>(
         model_path: P,
-        constants_path: P,
+        constants_path: Option<P>,
         fixed_sequence_len: usize,
         num_frag_types: usize,
         num_modloss_types: usize,
@@ -225,7 +230,7 @@
     /// Forward pass through the model.
     fn forward(&self, input: &Tensor) -> Result<Tensor>;
 
-    /// Predict the retention times for a peptide sequence.
+    /// Predict the property for a batch of peptide sequences.
     ///
     /// # Arguments
     /// * `peptide_sequences` - A vector of peptide sequences.
@@ -248,7 +253,8 @@
     ) -> Result<PredictionResult> {
         // Encode the batch of peptides
         let input_tensor =
-            self.encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument)?;
+            self.encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument)?
+                .to_device(self.get_device())?;
 
         // Forward pass through the model
         let output = self.forward(&input_tensor)?;
@@ -403,8 +409,27 @@
     /// Train the model from scratch using a batch of training data.
     ///
-    /// This method is similar to `fine_tune`, but assumes that the model was created from `new_untrained`
-    /// and has no pre-existing learned weights.
+    /// This method initializes model weights from scratch and trains over the given peptide feature data for a specified
+    /// number of epochs. Optionally performs validation and tracks both training and validation loss statistics.
+    /// Early stopping is applied if the validation loss does not improve for a consecutive number of epochs.
+    ///
+    /// # Arguments
+    /// * `training_data` - Vector of peptide records used for training.
+    /// * `validation_data` - Optional vector of peptide records used for validation at the end of each epoch.
+    /// * `modifications` - A map of known modifications to encode modified peptides.
+    /// * `batch_size` - Batch size used for training.
+    /// * `validation_batch_size` - Batch size used during validation.
+    /// * `learning_rate` - Learning rate for the AdamW optimizer.
+    /// * `epochs` - Maximum number of training epochs.
+    /// * `early_stopping_patience` - Number of epochs to wait before stopping if validation loss does not improve.
+    ///
+    /// # Returns
+    /// A `Vec` of tuples where each tuple contains:
+    /// * `epoch` - Epoch number.
+    /// * `avg_train_loss` - Average training loss for the epoch.
+    /// * `avg_val_loss` - Optional average validation loss for the epoch.
+    /// * `train_std` - Standard deviation of training loss across batches.
+    /// * `val_std` - Optional standard deviation of validation loss across batches.
fn train( &mut self, training_data: &Vec, @@ -414,12 +439,13 @@ pub trait ModelInterface: Send + Sync + ModelClone { crate::utils::peptdeep_utils::ModificationMap, >, batch_size: usize, + validation_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize, - ) -> Result<()> { + ) -> Result, f32, Option)>> { let num_batches = (training_data.len() + batch_size - 1) / batch_size; - + info!( "Training {} model from scratch on {} peptide features ({} batches) for {} epochs", self.get_model_arch(), @@ -427,39 +453,40 @@ pub trait ModelInterface: Send + Sync + ModelClone { num_batches, epochs ); - + let params = candle_nn::ParamsAdamW { lr: learning_rate, ..Default::default() }; let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; - + let mut best_val_loss = f32::INFINITY; let mut epochs_without_improvement = 0; - + let mut epoch_losses = vec![]; + for epoch in 0..epochs { let progress = Progress::new(num_batches, &format!("[training] Epoch {}: ", epoch)); - let mut total_loss = 0.0; - + let mut batch_losses = vec![]; + training_data .chunks(batch_size) .enumerate() - .try_for_each(|(batch_idx, batch_data)| { + .try_for_each(|(_batch_idx, batch_data)| { let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - + let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - + let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; - + + let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + let batch_targets = match self.property_type() { PropertyType::RT => PredictionResult::RTResult( batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), @@ -471,52 +498,52 @@ pub trait ModelInterface: Send + Sync + ModelClone { return Err(anyhow::anyhow!("Training from scratch is not yet implemented for MS2")); } }; - + let target_batch = match batch_targets { PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { Tensor::new(values.clone(), &self.get_device())? 
} PredictionResult::MS2Result(_) => unreachable!(), - }; - + }.to_device(self.get_device())?; + let predicted = self.forward(&input_batch)?; let loss = candle_nn::loss::mse(&predicted, &target_batch)?; opt.backward_step(&loss)?; - - total_loss += loss.to_vec0::().unwrap_or(999.0); - progress.update_description(&format!("[training] Epoch {}: Loss: {:.4}", epoch, loss.to_vec0::()?)); + + let loss_val = loss.to_vec0::().unwrap_or(999.0); + batch_losses.push(loss_val); + + progress.update_description(&format!("[training] Epoch {}: Loss: {:.4}", epoch, loss_val)); progress.inc(); - + Ok(()) })?; - - // Optional validation evaluation + + let avg_loss = batch_losses.iter().copied().sum::() / batch_losses.len() as f32; + let std_loss = (batch_losses.iter().map(|l| (l - avg_loss).powi(2)).sum::() / batch_losses.len() as f32).sqrt(); + if let Some(val_data) = validation_data { - let val_batches = (val_data.len() + batch_size - 1) / batch_size; + let val_batches = (val_data.len() + validation_batch_size - 1) / validation_batch_size; use rayon::prelude::*; - - let total_val_loss: f32 = val_data - .par_chunks(batch_size) + + let val_losses: Vec = val_data + .par_chunks(validation_batch_size) .map(|batch_data| { let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - + let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - + let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments); - let input_val = match input_val { - Ok(x) => x, - Err(e) => return Err(e), - }; - + + let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + let val_targets = match self.property_type() { PropertyType::RT => PredictionResult::RTResult( batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), @@ -528,47 +555,57 @@ pub trait ModelInterface: Send + Sync + ModelClone { return Err(anyhow::anyhow!("Validation not supported for MS2 yet")); } }; - + let target_val = match val_targets { PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => { Tensor::new(values.clone(), &self.get_device())? } PredictionResult::MS2Result(_) => unreachable!(), - }; - + }.to_device(self.get_device())?; + let predicted = self.forward(&input_val)?; let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; Ok(val_loss.to_vec0::()?) }) - .collect::>>()? - .into_iter() - .sum(); - - let avg_val_loss = total_val_loss / val_batches as f32; - let avg_loss = total_loss / num_batches as f32; - - progress.update_description(&format!("Epoch {}: Avg. Train Loss: {:.4} | Avg. Val. 
Loss: {:.4}", epoch, avg_loss, avg_val_loss)); + .collect::>>()?; + + let avg_val_loss = val_losses.iter().sum::() / val_losses.len() as f32; + let std_val_loss = (val_losses.iter().map(|l| (l - avg_val_loss).powi(2)).sum::() / val_losses.len() as f32).sqrt(); + + epoch_losses.push((epoch, avg_loss, Some(avg_val_loss), std_loss, Some(std_val_loss))); + + progress.update_description(&format!( + "Epoch {}: Avg. Train Loss: {:.4} (±{:.4}) | Avg. Val. Loss: {:.4} (±{:.4})", + epoch, avg_loss, std_loss, avg_val_loss, std_val_loss + )); progress.finish(); - + if avg_val_loss < best_val_loss { best_val_loss = avg_val_loss; epochs_without_improvement = 0; + + let checkpoint_path = format!("redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), epoch); + self.get_mut_varmap().save(&checkpoint_path)?; } else { epochs_without_improvement += 1; if epochs_without_improvement >= early_stopping_patience { info!("Early stopping triggered after {} epochs without validation loss improvement.", early_stopping_patience); - break; + return Ok(epoch_losses); } } } else { - let avg_loss = total_loss / num_batches as f32; - progress.update_description(&format!("Epoch {}: Avg. Train Loss: {:.4}", epoch, avg_loss)); + epoch_losses.push((epoch, avg_loss, None, std_loss, None)); + progress.update_description(&format!("Epoch {}: Avg. Train Loss: {:.4} (±{:.4})", epoch, avg_loss, std_loss)); progress.finish(); + + let checkpoint_path = format!("redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), epoch); + self.get_mut_varmap().save(&checkpoint_path)?; } } - - Ok(()) + + Ok(epoch_losses) } + /// Fine-tune the model on a batch of training data. /// @@ -664,7 +701,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { }; let input_batch = - self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?; + self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; log::trace!( "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", @@ -715,7 +752,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { feature_dim, ))? 
} - }; + }.to_device(self.get_device())?; let predicted = self.forward(&input_batch)?; let loss = candle_nn::loss::mse(&predicted, &target_batch)?; @@ -742,6 +779,89 @@ pub trait ModelInterface: Send + Sync + ModelClone { Ok(()) } + fn inference( + &self, + inference_data: &Vec, + batch_size: usize, + modifications: HashMap< + (String, Option), + crate::utils::peptdeep_utils::ModificationMap, + >, + rt_norm_params: Option<(f32, f32)>, + ) -> Result> { + let num_batches = (inference_data.len() + batch_size - 1) / batch_size; + info!( + "Performing inference on {} peptide features ({} batches)", + inference_data.len(), + num_batches + ); + + let progress = Progress::new(inference_data.len(), "[inference] Batch:"); + let mut result: Vec> = vec![None; inference_data.len()]; + + inference_data + .par_chunks(batch_size) + .enumerate() + .map(|(batch_idx, batch_data)| { + let start_idx = batch_idx * batch_size; + + let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); + let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); + + let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); + let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; + + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); + let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; + + let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); + let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; + + let input_tensor = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + let output = self.forward(&input_tensor)?; + + match self.property_type() { + PropertyType::RT | PropertyType::CCS => { + let predictions = output.to_vec1()?; + let updated: Vec<(usize, PeptideData)> = predictions + .into_iter() + .enumerate() + .map(|(i, pred)| { + let mut peptide = batch_data[i].clone(); + match self.property_type() { + PropertyType::RT => { + peptide.retention_time = if let Some((mean, std)) = rt_norm_params { + Some(pred * std + mean) + } else { + Some(pred) + }; + } + PropertyType::CCS => peptide.ion_mobility = Some(pred), + _ => {} + }; + (start_idx + i, peptide) + }) + .collect(); + Ok(updated) + } + PropertyType::MS2 => Err(anyhow::anyhow!("Inference not supported for MS2 models in batch mode")), + } + }) + .collect::>>>()? + .into_iter() + .flatten() + .for_each(|(idx, peptide)| { + result[idx] = Some(peptide); + progress.inc(); + }); + + progress.finish(); + Ok(result.into_iter().flatten().collect()) + } + + + /// Set model to evaluation mode for inference /// This disables dropout and other training-specific layers. fn set_evaluation_mode(&mut self); diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 42b4d57..32a634b 100644 --- a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -68,7 +68,7 @@ impl ModelInterface for MS2BertModel { /// Create a new MS2BERT model from the given model and constants files. fn new>( model_path: P, - constants_path: P, + constants_path: Option
<P>
, fixed_sequence_len: usize, num_frag_types: usize, num_modloss_types: usize, @@ -82,8 +82,10 @@ impl ModelInterface for MS2BertModel { let var_store = VarBuilder::from_varmap(&varmap, DType::F32, &device); - let constants: ModelConstants = - parse_model_constants(constants_path.as_ref().to_str().unwrap())?; + let constants = match constants_path { + Some(path) => parse_model_constants(path.as_ref().to_str().unwrap())?, + None => ModelConstants::default(), + }; // Load the mod_to_feature mapping let mod_to_feature = load_mod_to_feature(&constants)?; @@ -459,7 +461,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap(); + let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); println!("{:?}", model); } @@ -470,7 +472,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap(); + let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); let peptide_sequences = "AGHCEWQMKYR"; let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; @@ -495,7 +497,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, constants_path, 0, 8, 4, true, device).unwrap(); + let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); let peptide_sequences = vec!["AGHCEWQMKYR".to_string(), "AGHCEWQMKYR".to_string()]; let mods = vec![ diff --git a/crates/redeem-properties/src/models/ms2_model.rs b/crates/redeem-properties/src/models/ms2_model.rs index f4ed7e1..ea3c489 100644 --- a/crates/redeem-properties/src/models/ms2_model.rs +++ b/crates/redeem-properties/src/models/ms2_model.rs @@ -32,7 +32,7 @@ impl Clone for MS2ModelWrapper { impl MS2ModelWrapper { pub fn new>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { let model: Box = match arch { - "ms2_bert" => Box::new(MS2BertModel::new(model_path, constants_path, 0, 8, 4, true, device)?), + "ms2_bert" => Box::new(MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), // Add other cases here as you implement more models _ => return Err(anyhow!("Unsupported MS2 model architecture: {}", arch)), }; diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs index e73f6f4..1cb99c7 100644 --- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs @@ -14,6 +14,7 @@ use crate::utils::peptdeep_utils::{ load_mod_to_feature, parse_model_constants, ModelConstants, }; +use crate::utils::utils::get_tensor_stats; // Main Model Struct @@ -55,7 +56,7 @@ impl ModelInterface for RTCNNLSTMModel { /// Create a new RTCNNLSTMModel from the given model and constants files. fn new>( model_path: P, - constants_path: P, + constants_path: Option
<P>
, _fixed_sequence_len: usize, _num_frag_types: usize, _num_modloss_types: usize, @@ -69,8 +70,10 @@ impl ModelInterface for RTCNNLSTMModel { create_var_map(&mut varmap, tensor_data, &device)?; let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device); - let constants: ModelConstants = - parse_model_constants(constants_path.as_ref().to_str().unwrap())?; + let constants = match constants_path { + Some(path) => parse_model_constants(path.as_ref().to_str().unwrap())?, + None => ModelConstants::default(), + }; // Load the mod_to_feature mapping let mod_to_feature = load_mod_to_feature(&constants)?; @@ -118,10 +121,20 @@ impl ModelInterface for RTCNNLSTMModel { let (_batch_size, _seq_len, _) = xs.shape().dims3()?; let aa_indices_out = xs.i((.., .., 0))?; + let (mean, min, max) = get_tensor_stats(&aa_indices_out)?; + log::debug!("[RTCNNLSTMModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}"); let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; + let (mean, min, max) = get_tensor_stats(&mod_x_out)?; + log::debug!("[RTCNNLSTMModel] mod_x_out stats - min: {min}, max: {max}, mean: {mean}"); let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNLSTMModel] x stats - min: {min}, max: {max}, mean: {mean}"); let x = self.dropout.forward(&x, self.is_training)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNLSTMModel] x after dropout stats - min: {min}, max: {max}, mean: {mean}"); let x = self.rt_decoder.forward(&x)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNLSTMModel] x after decoder stats - min: {min}, max: {max}, mean: {mean}"); let result = x.squeeze(1)?; Ok(result) @@ -310,7 +323,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device).unwrap(); + let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); let peptide_sequences = "AGHCEWQMKYR"; let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; @@ -336,7 +349,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device.clone()).unwrap(); + let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device.clone()).unwrap(); // Batched input let peptide_sequences = vec![ @@ -383,7 +396,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = /* Assuming Device is defined */ Device::new_cuda(0).unwrap_or(/* assuming Device::Cpu is defined */ Device::Cpu); // Replace with actual Device code. 
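        // Since constants_path is now Option<P>, a caller without the YAML
        // constants file can pass None and the model falls back to
        // ModelConstants::default(). A sketch only (P is inferred from
        // model_path):
        //   let result = RTCNNLSTMModel::new(&model_path, None, 0, 8, 4, true, device);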
- let result = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device); + let result = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device); let mut model = result.unwrap(); // Test prediction with a few peptides after fine-tuning diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs index 789c127..6008137 100644 --- a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs @@ -15,6 +15,7 @@ use crate::utils::peptdeep_utils::{ load_mod_to_feature, parse_model_constants, ModelConstants, }; +use crate::utils::utils::get_tensor_stats; // Main Model Struct @@ -52,9 +53,9 @@ impl ModelInterface for RTCNNTFModel { let mut varmap = VarMap::new(); let varbuilder = VarBuilder::from_varmap(&varmap, DType::F32, &device); - + log::trace!("[RTCNNTFModel] Initializing rt_encoder"); let rt_encoder = Encoder26aaModCnnTransformerAttnSum::new( - &varbuilder, + &varbuilder.pp("rt_encoder"), 8, // mod_hidden_dim 140, // hidden_dim 256, // ff_dim @@ -65,12 +66,13 @@ impl ModelInterface for RTCNNTFModel { &device )?; - let rt_decoder = DecoderLinear::new(140, 1, &varbuilder)?; + log::trace!("[RTCNNTFModel] Initializing rt_decoder"); + let rt_decoder = DecoderLinear::new(140, 1, &varbuilder.pp("rt_decoder"))?; let constants = ModelConstants::default(); let mod_to_feature = load_mod_to_feature(&constants)?; Ok(Self { - var_store: VarBuilder::from_varmap(&varmap, DType::F32, &device), + var_store: varbuilder, varmap, constants, device, @@ -85,7 +87,7 @@ impl ModelInterface for RTCNNTFModel { /// Create a new RTCNNTFModel from the given model and constants files. fn new>( model_path: P, - constants_path: P, + constants_path: Option
<P>
, _fixed_sequence_len: usize, _num_frag_types: usize, _num_modloss_types: usize, @@ -97,8 +99,10 @@ impl ModelInterface for RTCNNTFModel { create_var_map(&mut varmap, tensor_data, &device)?; let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device); - let constants: ModelConstants = - parse_model_constants(constants_path.as_ref().to_str().unwrap())?; + let constants = match constants_path { + Some(path) => parse_model_constants(path.as_ref().to_str().unwrap())?, + None => ModelConstants::default(), + }; let mod_to_feature = load_mod_to_feature(&constants)?; let dropout = Dropout::new(0.1); @@ -146,20 +150,42 @@ impl ModelInterface for RTCNNTFModel { dropout, rt_encoder, rt_decoder, - is_training: true, + is_training: false, }) } fn forward(&self, xs: &Tensor) -> Result { let aa_indices_out = xs.i((.., .., 0))?; - let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; + let (mean, min, max) = get_tensor_stats(&aa_indices_out)?; + log::debug!("[RTCNNTFModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}"); + let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; + + if mod_x_out.shape().elem_count() == 0 { + log::error!("[RTCNNTFModel] mod_x_out is empty! shape: {:?}", mod_x_out.shape()); + } else { + match get_tensor_stats(&mod_x_out) { + Ok((mean, min, max)) => { + log::debug!("[RTCNNTFModel] mod_x_out stats - min: {min}, max: {max}, mean: {mean}"); + } + Err(e) => { + log::error!("[RTCNNTFModel] Failed to compute stats for mod_x_out: {:?}", e); + } + } + } + log::trace!("[RTCNNTFModel] aa_indices_out: {:?}, mod_x_out: {:?}", aa_indices_out, mod_x_out); let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; log::trace!("[RTCNNTFModel] x.shape after rt_encoder: {:?}", x.shape()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNTFModel] rt_encoder output stats - min: {min}, max: {max}, mean: {mean}"); let x = self.dropout.forward(&x, self.is_training)?; log::trace!("[RTCNNTFModel] x.shape after dropout: {:?}", x.shape()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNTFModel] dropout output stats - min: {min}, max: {max}, mean: {mean}"); let x = self.rt_decoder.forward(&x)?; log::trace!("[RTCNNTFModel] x.shape after rt_decoder: {:?}", x.shape()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[RTCNNTFModel] rt_decoder output stats - min: {min}, max: {max}, mean: {mean}"); Ok(x.squeeze(1)?) 
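        // Shape sketch, inferred from the slicing above rather than from any
        // documented contract: xs is [batch, seq_len, 1 + MOD_FEATURE_SIZE],
        // where column 0 holds amino-acid indices and the remaining columns
        // the modification features; the attention-sum encoder pools over
        // seq_len, the 140 -> 1 decoder leaves [batch, 1], and squeeze(1)
        // returns a [batch] retention-time vector.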
} @@ -261,7 +287,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device).unwrap(); + let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); let peptide_sequences = "AGHCEWQMKYR"; let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; @@ -287,7 +313,7 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, &constants_path, 0, 8, 4, true, device.clone()).unwrap(); + let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device.clone()).unwrap(); // Batched input let peptide_sequences = vec![ diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs index dd9bcab..f4a9643 100644 --- a/crates/redeem-properties/src/models/rt_model.rs +++ b/crates/redeem-properties/src/models/rt_model.rs @@ -1,10 +1,13 @@ // rt_model.rs use std::path::Path; +use std::ops::Deref; use candle_core::{Device, Tensor}; use anyhow::{Result, anyhow}; +use candle_nn::VarMap; use crate::models::model_interface::{ModelInterface,PredictionResult}; use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; +use crate::models::rt_cnn_transformer_model::RTCNNTFModel; use crate::utils::data_handling::PeptideData; use std::collections::HashMap; use crate::utils::peptdeep_utils::ModificationMap; @@ -33,10 +36,10 @@ impl Clone for RTModelWrapper { impl RTModelWrapper { - pub fn new>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { + pub fn new>(model_path: P, constants_path: Option
<P>
, arch: &str, device: Device) -> Result { let model: Box = match arch { "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device)?), - // Add other cases here as you implement more models + "rt_cnn_tf" => Box::new(RTCNNTFModel::new(model_path, constants_path, 0, 8, 4, true, device)?), _ => return Err(anyhow!("Unsupported RT model architecture: {}", arch)), }; @@ -47,10 +50,18 @@ impl RTModelWrapper { self.model.predict(peptide_sequence, mods, mod_sites, None, None, None) } + pub fn train(&mut self, training_data: &Vec, val_data: Option<&Vec>, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result, f32, Option)>> { + self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience) + } + pub fn fine_tune(&mut self, training_data: &Vec, modifications: HashMap<(String, Option), ModificationMap>, batch_size:usize, learning_rate: f64, epochs: usize) -> Result<()> { self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs) } + pub fn inference(&mut self, inference_data: &Vec, batch_size: usize, modifications: HashMap<(String, Option), ModificationMap>, rt_norm_params: Option<(f32, f32)>,) -> Result> { + self.model.inference(inference_data, batch_size, modifications, rt_norm_params) + } + pub fn set_evaluation_mode(&mut self) { self.model.set_evaluation_mode() } @@ -73,7 +84,7 @@ impl RTModelWrapper { } // Public API Function to load a new RT model -pub fn load_retention_time_model>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { +pub fn load_retention_time_model>(model_path: P, constants_path: Option
<P>
, arch: &str, device: Device) -> Result { RTModelWrapper::new(model_path, constants_path, arch, device) } diff --git a/crates/redeem-properties/src/utils/data_handling.rs b/crates/redeem-properties/src/utils/data_handling.rs index 84f26d0..11520ce 100644 --- a/crates/redeem-properties/src/utils/data_handling.rs +++ b/crates/redeem-properties/src/utils/data_handling.rs @@ -1,5 +1,5 @@ - +#[derive(Clone)] pub struct PeptideData { pub sequence: String, pub charge: Option, diff --git a/crates/redeem-properties/src/utils/logging.rs b/crates/redeem-properties/src/utils/logging.rs index 29c3f0f..9b6d322 100644 --- a/crates/redeem-properties/src/utils/logging.rs +++ b/crates/redeem-properties/src/utils/logging.rs @@ -86,7 +86,7 @@ impl Progress { let new_count = self.count.fetch_add(1, Ordering::AcqRel) + 1; if new_count > self.total { - println!("⚠️ WARNING: Extra update detected! Skipping..."); + log::trace!("⚠️ WARNING: Progress logger received and extra update! This is likely because the logger was initialized with an incorrect total counter, and the process is iterating beyond that counter."); return; // Prevent overflow } diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs index 8b274b5..f477527 100644 --- a/crates/redeem-properties/src/utils/peptdeep_utils.rs +++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs @@ -123,6 +123,7 @@ impl Default for ModelConstants { "Lumos".into(), "timsTOF".into(), "SciexTOF".into(), + "ThermoTOF".into(), ], max_instrument_num: 8, mod_elements: vec![ diff --git a/crates/redeem-properties/src/utils/utils.rs b/crates/redeem-properties/src/utils/utils.rs index 34572a6..5b21291 100644 --- a/crates/redeem-properties/src/utils/utils.rs +++ b/crates/redeem-properties/src/utils/utils.rs @@ -1,4 +1,4 @@ -use candle_core::Device; +use candle_core::{Device, Tensor}; use candle_core::utils::{cuda_is_available, metal_is_available}; use anyhow::{Result, anyhow}; use std::f64::consts::PI; @@ -150,6 +150,29 @@ pub fn device(cpu: bool) -> Result { } +pub fn get_tensor_stats(x: &Tensor) -> Result<(f32, f32, f32), candle_core::Error> { + // let flat: Vec = match x.rank() { + // 0 => vec![x.to_scalar::()?], + // 1 => x.to_vec1::()?, + // 2 => x.to_vec2::()?.into_iter().flatten().collect(), + // 3 => x.to_vec3::()?.into_iter().flatten().flatten().collect(), + // _ => return Err(candle_core::Error::Msg(format!("Unsupported tensor rank: {}", x.rank()))), + // }; + let flat = x.flatten_all()?.to_vec1::()?; + + if flat.is_empty() { + return Err(candle_core::Error::Msg("Tensor has no elements to compute stats.".to_string())); + } + + let mean = flat.iter().copied().sum::() / flat.len() as f32; + let min = flat.iter().copied().fold(f32::INFINITY, f32::min); + let max = flat.iter().copied().fold(f32::NEG_INFINITY, f32::max); + + Ok((mean, min, max)) +} + + + #[cfg(test)] mod tests { use super::*; From bf62774fe0e032a087742ed23094fe744f26fa0d Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 21:44:30 -0400 Subject: [PATCH 23/75] feat: Add inference functionality to redeem-cli --- crates/redeem-cli/src/main.rs | 72 ++++++++++++++++--- .../src/properties/inference/inference.rs | 52 ++++++++++++++ .../src/properties/inference/input.rs | 69 ++++++++++++++++++ .../src/properties/inference/mod.rs | 3 + .../src/properties/inference/output.rs | 46 ++++++++++++ 5 files changed, 233 insertions(+), 9 deletions(-) create mode 100644 crates/redeem-cli/src/properties/inference/inference.rs create mode 100644 
crates/redeem-cli/src/properties/inference/input.rs create mode 100644 crates/redeem-cli/src/properties/inference/mod.rs create mode 100644 crates/redeem-cli/src/properties/inference/output.rs diff --git a/crates/redeem-cli/src/main.rs b/crates/redeem-cli/src/main.rs index a6f8874..d9ea4dd 100644 --- a/crates/redeem-cli/src/main.rs +++ b/crates/redeem-cli/src/main.rs @@ -3,8 +3,10 @@ use log::LevelFilter; use std::path::PathBuf; use anyhow::Result; -use redeem_cli::properties::train::input::{self, PropertyTrainConfig}; +use redeem_cli::properties::train::input::PropertyTrainConfig; use redeem_cli::properties::train::trainer; +use redeem_cli::properties::inference::input::PropertyInferenceConfig; +use redeem_cli::properties::inference::inference; fn main() -> Result<()> { env_logger::Builder::default() @@ -79,13 +81,52 @@ fn main() -> Result<()> { "ccs_cnn_lstm", ]) .required(false) - ) - .help_template( - "{usage-heading} {usage}\n\n\ - {about-with-newline}\n\ - Written by {author-with-newline}Version {version}\n\n\ - {all-args}{after-help}", + ) + .arg( + Arg::new("checkpoint_file") + .short('c') + .long("checkpoint_file") + .value_parser(clap::builder::NonEmptyStringValueParser::new()) + .help( + "File path of the checkpoint safetensors file to load. \ + Overrides the checkpoint_file specified in the configuration file.", + ) + .value_hint(ValueHint::FilePath), ), + ) + .subcommand(Command::new("inference") + .about("Perform inference on new data using a trained model") + .arg( + Arg::new("config") + .help("Path to training configuration file") + .required(true) + .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("model_path") + .short('m') + .long("model") + .help("Path to the trained model file (*.safetensors)") + .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("inference_data") + .short('d') + .long("inference_data") + .help("Path to the input data file") + .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ) + .arg( + Arg::new("output_file") + .short('o') + .long("output_file") + .help("Path to the output file for predictions (*.tsv or *.csv)") + .value_parser(clap::value_parser!(PathBuf)) + .value_hint(ValueHint::FilePath), + ) ), ) .subcommand( @@ -103,6 +144,12 @@ fn main() -> Result<()> { ), ), ) + .help_template( + "{usage-heading} {usage}\n\n\ + {about-with-newline}\n\ + Written by {author-with-newline}Version {version}\n\n\ + {all-args}{after-help}", + ) .get_matches(); match matches.subcommand() { @@ -116,10 +163,17 @@ fn handle_properties(matches: &ArgMatches) -> Result<()> { match matches.subcommand() { Some(("train", train_matches)) => { let config_path: &PathBuf = train_matches.get_one("config").unwrap(); - println!("[ReDeeM::Properties] Training from config: {:?}", config_path); - let params: PropertyTrainConfig = input::PropertyTrainConfig::from_arguments(config_path, train_matches)?; + log::info!("[ReDeeM::Properties] Training from config: {:?}", config_path); + let params: PropertyTrainConfig = PropertyTrainConfig::from_arguments(config_path, train_matches)?; let _ = trainer::run_training(¶ms); Ok(()) + }, + Some(("inference", inference_matches)) => { + let config_path: &PathBuf = inference_matches.get_one("config").unwrap(); + log::info!("[ReDeeM::Properties] Inference using config: {:?}", config_path); + let params: PropertyInferenceConfig = PropertyInferenceConfig::from_arguments(config_path, inference_matches)?; + let _ = 
inference:: run_inference(¶ms); + Ok(()) } _ => unreachable!(), } diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs new file mode 100644 index 0000000..876af7e --- /dev/null +++ b/crates/redeem-cli/src/properties/inference/inference.rs @@ -0,0 +1,52 @@ +use anyhow::{Context, Result}; +use redeem_properties::utils::data_handling::PeptideData; +use redeem_properties::utils::peptdeep_utils::load_modifications; +use redeem_properties::utils::utils::get_device; +use redeem_properties::models::rt_model::load_retention_time_model; + +use crate::properties::load_data::load_peptide_data; +use crate::properties::util::write_bytes_to_file; +use crate::properties::inference::input::PropertyInferenceConfig; +use crate::properties::inference::output::write_peptide_data; + +pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { + + // Load inference data + let (inference_data, norm_factor) = load_peptide_data(&config.inference_data, Some(config.nce), Some(config.instrument.clone()), true)?; + log::info!("Loaded {} peptides", inference_data.len()); + + // Dispatch model training based on architecture + let model_arch = config.model_arch.as_str(); + let device = get_device(&config.device)?; + + let mut model = load_retention_time_model( + &config.model_path, + None, + &config.model_arch, + device.clone(), + )?; + + let modifications = load_modifications().context("Failed to load modifications")?; + + let start_time = std::time::Instant::now(); + model.set_evaluation_mode(); + let inference_results: Vec = model.inference( + &inference_data, + config.batch_size, + modifications, + norm_factor, + )?; + log::info!("Inference completed in {:?}", start_time.elapsed()); + + + log::info!("Predictions saved to: {}", config.output_file); + write_peptide_data(&inference_results, &config.output_file)?; + + let path = "redeem_inference_config.json"; + let json = serde_json::to_string_pretty(&config)?; + println!("{}", json); + let bytes = serde_json::to_vec_pretty(&config)?; + write_bytes_to_file(path, &bytes)?; + + Ok(()) +} diff --git a/crates/redeem-cli/src/properties/inference/input.rs b/crates/redeem-cli/src/properties/inference/input.rs new file mode 100644 index 0000000..0e6119e --- /dev/null +++ b/crates/redeem-cli/src/properties/inference/input.rs @@ -0,0 +1,69 @@ +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::PathBuf; +use clap::ArgMatches; +use anyhow::{Context, Result}; + +use crate::properties::util::validate_tsv_or_csv_file; + +#[derive(Debug, Deserialize, Serialize, Clone)] +pub struct PropertyInferenceConfig { + pub model_path: String, + pub inference_data: String, + pub output_file: String, + pub model_arch: String, + pub device: String, + pub batch_size: usize, + pub instrument: String, + pub nce: i32, +} + +impl Default for PropertyInferenceConfig { + fn default() -> Self { + PropertyInferenceConfig { + model_path: String::new(), + inference_data: String::new(), + output_file: String::from("redeem_inference.csv"), + model_arch: String::from("rt_cnn_tf"), + device: String::from("cpu"), + batch_size: 64, + instrument: String::from("QE"), + nce: 20, + } + } +} + +impl PropertyInferenceConfig { + pub fn from_arguments(config_path: &PathBuf, matches: &ArgMatches) -> Result { + let config_json = fs::read_to_string(config_path) + .with_context(|| format!("Failed to read config file: {:?}", config_path))?; + + let mut config: PropertyInferenceConfig = serde_json::from_str(&config_json) + 
.unwrap_or_else(|_| PropertyInferenceConfig::default()); + + // Apply CLI overrides + if let Some(model_path) = matches.get_one::("model_path") { + config.model_path = model_path.clone(); + } else { + config.model_path = config.model_path.clone(); + } + + if let Some(inference_data) = matches.get_one::("inference_data") { + validate_tsv_or_csv_file(inference_data)?; + config.inference_data = inference_data.clone().to_string(); + } else { + validate_tsv_or_csv_file(&config.inference_data)?; + } + + if let Some(output_file) = matches.get_one::("output_file") { + config.output_file = output_file.clone(); + } + + if let Some(model_arch) = matches.get_one::("model_arch") { + config.model_arch = model_arch.clone(); + } + + Ok(config) + } +} + diff --git a/crates/redeem-cli/src/properties/inference/mod.rs b/crates/redeem-cli/src/properties/inference/mod.rs new file mode 100644 index 0000000..54c03a5 --- /dev/null +++ b/crates/redeem-cli/src/properties/inference/mod.rs @@ -0,0 +1,3 @@ +pub mod inference; +pub mod input; +pub mod output; \ No newline at end of file diff --git a/crates/redeem-cli/src/properties/inference/output.rs b/crates/redeem-cli/src/properties/inference/output.rs new file mode 100644 index 0000000..2e91c97 --- /dev/null +++ b/crates/redeem-cli/src/properties/inference/output.rs @@ -0,0 +1,46 @@ +use std::fs::File; +use std::io::{BufWriter, Write}; +use anyhow::{Result, Context}; +use std::path::Path; +use redeem_properties::utils::data_handling::PeptideData; + +/// Write a vector of PeptideData to a CSV or TSV file based on file extension. +pub fn write_peptide_data>(data: &[PeptideData], output_path: P) -> Result<()> { + let path = output_path.as_ref(); + let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("csv"); + let delimiter = match extension { + "tsv" => '\t', + _ => ',', + }; + + let file = File::create(path).with_context(|| format!("Failed to create output file: {:?}", path))?; + let mut writer = csv::WriterBuilder::new() + .delimiter(delimiter as u8) + .from_writer(BufWriter::new(file)); + + // Write headers + writer.write_record(&["sequence", "charge", "nce", "instrument", "retention_time", "ion_mobility", "ms2_intensities"])?; + + for entry in data { + let ms2_str = entry.ms2_intensities.as_ref() + .map(|intensities| { + intensities.iter() + .map(|v| v.iter().map(|f| f.to_string()).collect::>().join(",")) + .collect::>().join("|") + }) + .unwrap_or_default(); + + writer.write_record(&[ + &entry.sequence, + &entry.charge.map_or(String::new(), |c| c.to_string()), + &entry.nce.map_or(String::new(), |n| n.to_string()), + &entry.instrument.clone().unwrap_or_default(), + &entry.retention_time.map_or(String::new(), |r| format!("{:.4}", r)), + &entry.ion_mobility.map_or(String::new(), |im| format!("{:.4}", im)), + &ms2_str, + ])?; + } + + writer.flush()?; + Ok(()) +} \ No newline at end of file From eab57a03e1134b1109e361ce4facadf5e1bdf793 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 21:44:59 -0400 Subject: [PATCH 24/75] refactor: Add new modules for training and loading data in redeem-cli --- crates/redeem-cli/Cargo.toml | 4 + crates/redeem-cli/src/properties/load_data.rs | 68 +++++--- crates/redeem-cli/src/properties/mod.rs | 4 +- .../redeem-cli/src/properties/train/input.rs | 31 ++-- crates/redeem-cli/src/properties/train/mod.rs | 16 +- .../redeem-cli/src/properties/train/plot.rs | 79 ++++++++++ .../src/properties/train/trainer.rs | 145 ++++++++++++++++-- crates/redeem-cli/src/properties/util.rs | 26 ++++ 8 files changed, 319 
insertions(+), 54 deletions(-) create mode 100644 crates/redeem-cli/src/properties/train/plot.rs create mode 100644 crates/redeem-cli/src/properties/util.rs diff --git a/crates/redeem-cli/Cargo.toml b/crates/redeem-cli/Cargo.toml index 8e2fbd7..f130987 100644 --- a/crates/redeem-cli/Cargo.toml +++ b/crates/redeem-cli/Cargo.toml @@ -18,6 +18,10 @@ anyhow = "1.0" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" csv = "1.1" +report-builder = "0.1.0" +maud = "0.27.0" +plotly = "0.12.1" +rand = "0.8" [dependencies.candle-core] version = "0.8.4" diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs index 7be1cfb..0c423ef 100644 --- a/crates/redeem-cli/src/properties/load_data.rs +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -5,18 +5,17 @@ use anyhow::{Result, Context}; use csv::ReaderBuilder; use redeem_properties::utils::data_handling::PeptideData; -/// Load peptide training data from a CSV or TSV file. +/// Load peptide training data from a CSV or TSV file and optionally normalize RT. /// -/// Automatically determines the delimiter and supports RT models. -/// Currently expects columns: "sequence", "retention time" (others optional). -/// -/// # Arguments -/// * `path` - Path to the input CSV/TSV file -/// -/// # Returns -/// Vector of parsed `PeptideData` records -pub fn load_peptide_data>(path: P) -> Result> { - let file = File::open(&path).with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?; +/// Returns both the peptide vector and optionally (mean, std) of retention times. +pub fn load_peptide_data>( + path: P, + nce: Option, + instrument: Option, + normalize_rt: bool, +) -> Result<(Vec, Option<(f32, f32)>)> { + let file = File::open(&path) + .with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?; let reader = BufReader::new(file); let is_tsv = path.as_ref().extension().map(|e| e == "tsv").unwrap_or(false); @@ -28,8 +27,9 @@ pub fn load_peptide_data>(path: P) -> Result> { .from_reader(reader); let headers = rdr.headers()?.clone(); - let mut peptides = Vec::new(); + let mut rt_values = Vec::new(); + for result in rdr.records() { let record = result?; @@ -46,24 +46,50 @@ pub fn load_peptide_data>(path: P) -> Result> { .get(headers.iter().position(|h| h == "charge").unwrap_or(usize::MAX)) .and_then(|s| s.parse::().ok()); - let nce = record - .get(headers.iter().position(|h| h == "nce").unwrap_or(usize::MAX)) - .and_then(|s| s.parse::().ok()); + let in_nce = nce.or_else(|| { + record + .get(headers.iter().position(|h| h == "nce").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()) + }); - let instrument = record - .get(headers.iter().position(|h| h == "instrument").unwrap_or(usize::MAX)) - .map(|s| s.to_string()); + let in_instrument = instrument.clone().or_else(|| { + record + .get(headers.iter().position(|h| h == "instrument").unwrap_or(usize::MAX)) + .map(|s| s.to_string()) + }); + + if let Some(rt) = retention_time { + rt_values.push(rt); + } peptides.push(PeptideData::new( &sequence, charge, - nce, - instrument.as_deref(), + in_nce, + in_instrument.as_deref(), retention_time, None, None, )); } - Ok(peptides) + if normalize_rt && !rt_values.is_empty() { + let mean = rt_values.iter().copied().sum::() / rt_values.len() as f32; + let std = (rt_values + .iter() + .map(|v| (v - mean).powi(2)) + .sum::() + / rt_values.len() as f32) + .sqrt(); + + for peptide in &mut peptides { + if let Some(rt) = peptide.retention_time.as_mut() { + *rt = (*rt - mean) / std; + } + } + + 
Ok((peptides, Some((mean, std)))) + } else { + Ok((peptides, None)) + } } diff --git a/crates/redeem-cli/src/properties/mod.rs b/crates/redeem-cli/src/properties/mod.rs index eb69af4..b53ed37 100644 --- a/crates/redeem-cli/src/properties/mod.rs +++ b/crates/redeem-cli/src/properties/mod.rs @@ -1,2 +1,4 @@ pub mod train; -pub mod load_data; \ No newline at end of file +pub mod inference; +pub mod load_data; +pub mod util; diff --git a/crates/redeem-cli/src/properties/train/input.rs b/crates/redeem-cli/src/properties/train/input.rs index fb5d3f3..58d3fd9 100644 --- a/crates/redeem-cli/src/properties/train/input.rs +++ b/crates/redeem-cli/src/properties/train/input.rs @@ -1,20 +1,25 @@ -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use std::fs; use std::path::PathBuf; use clap::ArgMatches; use anyhow::{Context, Result}; -#[derive(Debug, Deserialize, Clone)] +use crate::properties::util::validate_tsv_or_csv_file; + +#[derive(Debug, Deserialize, Serialize, Clone)] pub struct PropertyTrainConfig { + pub version: String, pub train_data: String, pub validation_data: Option, pub output_file: String, pub model_arch: String, pub device: String, pub batch_size: usize, + pub validation_batch_size: Option, pub learning_rate: f32, pub epochs: usize, pub early_stopping_patience: usize, + pub checkpoint_file: Option, pub instrument: String, pub nce: i32, } @@ -22,15 +27,18 @@ pub struct PropertyTrainConfig { impl Default for PropertyTrainConfig { fn default() -> Self { PropertyTrainConfig { + version: clap::crate_version!().to_string(), train_data: String::new(), validation_data: None, output_file: String::from("rt_cnn_tf.safetensors"), model_arch: String::from("rt_cnn_tf"), device: String::from("cpu"), batch_size: 64, + validation_batch_size: None, learning_rate: 1e-3, epochs: 10, early_stopping_patience: 5, + checkpoint_file: None, instrument: String::from("QE"), nce: 20, } @@ -68,23 +76,12 @@ impl PropertyTrainConfig { config.model_arch = model_arch.clone(); } + if let Some(checkpoint_file) = matches.get_one::("checkpoint_file") { + config.checkpoint_file = Some(checkpoint_file.clone()); + } + Ok(config) } } -pub fn validate_tsv_or_csv_file(path: &str) -> Result<()> { - let pb = PathBuf::from(path); - - let ext = pb.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()); - match ext.as_deref() { - Some("tsv") | Some("csv") => {} - _ => anyhow::bail!("File must have a .tsv or .csv extension: {}", path), - } - - if !pb.exists() { - anyhow::bail!("File does not exist: {}", path); - } - - Ok(()) -} diff --git a/crates/redeem-cli/src/properties/train/mod.rs b/crates/redeem-cli/src/properties/train/mod.rs index d60a05a..1b1dd9e 100644 --- a/crates/redeem-cli/src/properties/train/mod.rs +++ b/crates/redeem-cli/src/properties/train/mod.rs @@ -1,2 +1,16 @@ pub mod input; -pub mod trainer; \ No newline at end of file +pub mod trainer; +pub mod plot; + +use rand::seq::SliceRandom; +use rand::thread_rng; +use redeem_properties::utils::data_handling::PeptideData; + +pub fn sample_peptides(peptides: &[PeptideData], n: usize) -> Vec { + let mut rng = thread_rng(); + let sample_size = n.min(peptides.len()); + peptides + .choose_multiple(&mut rng, sample_size) + .cloned() + .collect() +} diff --git a/crates/redeem-cli/src/properties/train/plot.rs b/crates/redeem-cli/src/properties/train/plot.rs new file mode 100644 index 0000000..f5b7b80 --- /dev/null +++ b/crates/redeem-cli/src/properties/train/plot.rs @@ -0,0 +1,79 @@ +use plotly::{Layout, Plot, Scatter}; +use plotly::common::{Fill, Mode, 
Title}; + +pub fn plot_losses( + epoch_losses: &[(usize, f32, Option, f32, Option)] +) -> Plot { + let epochs: Vec<_> = epoch_losses.iter().map(|(e, _, _, _, _)| *e as f64).collect(); + + let train_mean: Vec<_> = epoch_losses.iter().map(|(_, m, _, _, _)| *m as f64).collect(); + let train_std: Vec<_> = epoch_losses.iter().map(|(_, _, _, std, _)| *std as f64).collect(); + let train_upper: Vec<_> = train_mean.iter().zip(&train_std).map(|(m, s)| m + s).collect(); + let train_lower: Vec<_> = train_mean.iter().zip(&train_std).map(|(m, s)| m - s).collect(); + + let val_mean: Vec<_> = epoch_losses.iter().map(|(_, _, val, _, _)| val.unwrap_or(f32::NAN) as f64).collect(); + let val_std: Vec<_> = epoch_losses.iter().map(|(_, _, _, _, val_std)| val_std.unwrap_or(0.0) as f64).collect(); + let val_upper: Vec<_> = val_mean.iter().zip(&val_std).map(|(m, s)| m + s).collect(); + let val_lower: Vec<_> = val_mean.iter().zip(&val_std).map(|(m, s)| m - s).collect(); + + let mut plot = Plot::new(); + + // Training loss line + plot.add_trace( + Scatter::new(epochs.clone(), train_mean.clone()) + .name("Train Loss") + .mode(Mode::Lines) + .line(plotly::common::Line::new().color("rgba(31, 119, 180, 1.0)")), + ); + + // Training loss band + let mut train_band_y = train_upper.clone(); + let mut train_band_x = epochs.clone(); + let mut lower_reversed: Vec<_> = train_lower.iter().cloned().rev().collect(); + let mut x_reversed: Vec<_> = epochs.iter().cloned().rev().collect(); + train_band_y.extend(lower_reversed); + train_band_x.extend(x_reversed); + + plot.add_trace( + Scatter::new(train_band_x, train_band_y) + .name("Train ± σ") + .mode(Mode::Lines) + .fill(Fill::ToSelf) + .line(plotly::common::Line::new().width(0.0)) + .fill_color("rgba(31, 119, 180, 0.2)") + ); + + // Validation loss line + plot.add_trace( + Scatter::new(epochs.clone(), val_mean.clone()) + .name("Val Loss") + .mode(Mode::Lines) + .line(plotly::common::Line::new().color("rgba(255, 127, 14, 1.0)")), + ); + + // Validation loss band + let mut val_band_y = val_upper.clone(); + let mut val_band_x = epochs.clone(); + let mut val_lower_rev: Vec<_> = val_lower.iter().cloned().rev().collect(); + let mut val_x_rev: Vec<_> = epochs.iter().cloned().rev().collect(); + val_band_y.extend(val_lower_rev); + val_band_x.extend(val_x_rev); + + plot.add_trace( + Scatter::new(val_band_x, val_band_y) + .name("Val ± σ") + .mode(Mode::Lines) + .fill(Fill::ToSelf) + .line(plotly::common::Line::new().width(0.0)) + .fill_color("rgba(255, 127, 14, 0.2)") + ); + + plot.set_layout( + Layout::new() + .title("Training and Validation Loss Over Epochs") + .x_axis(plotly::layout::Axis::new().title("Epoch")) + .y_axis(plotly::layout::Axis::new().title("Loss")) + ); + + plot +} diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index 2d23994..f16a4bf 100644 --- a/crates/redeem-cli/src/properties/train/trainer.rs +++ b/crates/redeem-cli/src/properties/train/trainer.rs @@ -1,30 +1,40 @@ use anyhow::{Context, Result}; -use input::PropertyTrainConfig; -use load_data::load_peptide_data; use redeem_properties::models::model_interface::ModelInterface; +use redeem_properties::models::rt_model::load_retention_time_model; use redeem_properties::models::{rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel}; use redeem_properties::utils::data_handling::PeptideData; use redeem_properties::utils::peptdeep_utils::load_modifications; use redeem_properties::utils::utils::get_device; -use std::path::PathBuf; -use 
candle_core::Device; +use report_builder::{ + plots::{plot_boxplot, plot_pp, plot_scatter, plot_score_histogram}, + Report, ReportSection, +}; +use maud::{html, PreEscaped}; +use input::PropertyTrainConfig; +use load_data::load_peptide_data; use crate::properties::load_data; +use crate::properties::train::plot::plot_losses; +use crate::properties::train::sample_peptides; +use crate::properties::util::write_bytes_to_file; use super::input; pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { // Load training data - let train_peptides: Vec = load_peptide_data(&config.train_data)?; + let (train_peptides, norm_factor) = load_peptide_data(&config.train_data, Some(config.nce), Some(config.instrument.clone()), true)?; log::info!("Loaded {} training peptides", train_peptides.len()); // Load validation data if specified - let val_peptides = if let Some(ref val_path) = config.validation_data { - Some(load_peptide_data(val_path).context("Failed to load validation data")?) + let (val_peptides, _val_norm_factor) = if let Some(ref val_path) = config.validation_data { + let (peptides, norm) = load_peptide_data(val_path, Some(config.nce), Some(config.instrument.clone()), true) + .context("Failed to load validation data")?; + (Some(peptides), Some(norm)) } else { - None + (None, None) }; + if let Some(ref val_data) = val_peptides { log::info!("Loaded {} validation peptides", val_data.len()); @@ -35,29 +45,136 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { // Dispatch model training based on architecture let model_arch = config.model_arch.as_str(); let device = get_device(&config.device)?; + log::trace!("Loading model architecture: {} on device: {:?}", model_arch, device); - let mut model: Box = match model_arch { - "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new_untrained(device.clone())?), - "rt_cnn_tf" => Box::new(RTCNNTFModel::new_untrained(device.clone())?), - _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", model_arch)), + let mut model: Box = match &config.checkpoint_file { + Some(checkpoint_path) => { + log::info!("Loading model from checkpoint: {}", checkpoint_path); + match config.model_arch.as_str() { + "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(checkpoint_path, None, 0, 8, 4, true, device.clone())?), + "rt_cnn_tf" => Box::new(RTCNNTFModel::new(checkpoint_path, None, 0, 8, 4, true, device.clone())?), + _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", config.model_arch)), + } + } + None => { + match config.model_arch.as_str() { + "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new_untrained(device.clone())?), + "rt_cnn_tf" => Box::new(RTCNNTFModel::new_untrained(device.clone())?), + _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", config.model_arch)), + } + } }; + + + log::trace!("Model loaded successfully"); + + log::trace!("Loading modifications map"); let modifications = load_modifications().context("Failed to load modifications")?; let start_time = std::time::Instant::now(); - model.train( + log::trace!("Training started"); + let epoch_losses = model.train( &train_peptides, val_peptides.as_ref(), - modifications, + modifications.clone(), config.batch_size, + config.validation_batch_size.unwrap_or(config.batch_size), config.learning_rate as f64, config.epochs, config.early_stopping_patience, )?; log::info!("Training completed in {:?}", start_time.elapsed()); + // Generate report + let mut report = Report::new( + "ReDeeM", + &config.version, + 
Some("https://github.com/singjc/redeem/blob/master/img/redeem_logo.png?raw=true"), + "ReDeeM Trainer Report", + ); + + /* Section 1: Overview */ + { + let mut overview_section = ReportSection::new("Overview"); + + overview_section.add_content(html! { + "This report summarizes the training process of the ReDeeM model." + }); + + let losses_plot = plot_losses(&epoch_losses); + overview_section.add_plot(losses_plot); + + // Lets perform inference on 1000 random samples from the validation set + let val_peptides: Vec = sample_peptides(&val_peptides.as_ref().unwrap(), 1000); + let inference_results: Vec = model.inference( + &val_peptides, + config.batch_size, + modifications, + norm_factor, + )?; + let (true_rt, pred_rt): (Vec, Vec) = val_peptides + .iter() + .zip(&inference_results) + .filter_map(|(true_pep, pred_pep)| { + match (true_pep.retention_time, pred_pep.retention_time) { + (Some(t), Some(p)) => { + let t_denorm = t as f64 * norm_factor.unwrap().1 as f64 + norm_factor.unwrap().0 as f64; // de-normalized true RT + Some((t_denorm, p as f64)) // assume predicted is already de-normalized + }, + _ => None, + } + }) + .unzip(); + + + let scatter_plot = plot_scatter( + &vec![true_rt.clone()], + &vec![pred_rt.clone()], + vec!["RT Prediction".to_string()], + "Predicted vs True RT", + "Target RT", + "Predicted RT" + ).unwrap(); + overview_section.add_plot(scatter_plot); + report.add_section(overview_section); + } + + /* Section 2: Configuration */ + { + let mut config_section = ReportSection::new("Configuration"); + config_section.add_content(html! { + style { + ".code-container { + background-color: #f5f5f5; + padding: 10px; + border-radius: 5px; + overflow-x: auto; + font-family: monospace; + white-space: pre-wrap; + }" + } + div class="code-container" { + pre { + code { (PreEscaped(serde_json::to_string_pretty(&config)?)) } + } + } + }); + report.add_section(config_section); + } + + // Save the report to HTML file + let path = "redeem_trainer_report.html"; + report.save_to_file(&path.to_string())?; + model.save(&config.output_file)?; log::info!("Model saved to: {}", config.output_file); + let path = "redeem_trainer_config.json"; + let json = serde_json::to_string_pretty(&config)?; + println!("{}", json); + let bytes = serde_json::to_vec_pretty(&config)?; + write_bytes_to_file(path, &bytes)?; + Ok(()) } diff --git a/crates/redeem-cli/src/properties/util.rs b/crates/redeem-cli/src/properties/util.rs new file mode 100644 index 0000000..27ef965 --- /dev/null +++ b/crates/redeem-cli/src/properties/util.rs @@ -0,0 +1,26 @@ +use anyhow::Result; +use std::{fs::File, io::Write, path::{Path, PathBuf}}; + + +pub fn validate_tsv_or_csv_file(path: &str) -> Result<()> { + let pb = PathBuf::from(path); + + let ext = pb.extension().and_then(|s| s.to_str()).map(|s| s.to_lowercase()); + match ext.as_deref() { + Some("tsv") | Some("csv") => {} + _ => anyhow::bail!("File must have a .tsv or .csv extension: {}", path), + } + + if !pb.exists() { + anyhow::bail!("File does not exist: {}", path); + } + + Ok(()) +} + +pub fn write_bytes_to_file(path: &str, bytes: &[u8]) -> std::io::Result<()> { + let path = Path::new(path); + let mut file = File::create(path)?; + file.write_all(bytes)?; + Ok(()) +} \ No newline at end of file From 0b68ac33ae40be098b897baa799d145b85cf88f1 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 21:45:15 -0400 Subject: [PATCH 25/75] refactor: Update Dockerfile to optimize build process and clean up artifacts --- Dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) 
diff --git a/Dockerfile b/Dockerfile index 4679edb..c723776 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,7 +34,8 @@ ENV CUDA_COMPUTE_CAP=70 WORKDIR /app # Copy the source code into the container -COPY . . +COPY Cargo.toml Cargo.lock ./ +COPY crates ./crates # Build the application with CUDA support RUN cargo build --release --bin redeem --features cuda @@ -42,5 +43,8 @@ RUN cargo build --release --bin redeem --features cuda # Copy the binary into the PATH RUN cp target/release/redeem /app/redeem +# clean up build artifacts +RUN cargo clean + # Set the PATH environment variable ENV PATH="/app:${PATH}" \ No newline at end of file From e4bfaf9193f89f7b42920c1f31e09ddf6d45b757 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 22:02:28 -0400 Subject: [PATCH 26/75] add: Encoder26aaModChargeCnnTransformerAttnSum implementation --- .../src/building_blocks/building_blocks.rs | 126 ++++++++++++++++-- 1 file changed, 118 insertions(+), 8 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index edd7250..c1e164d 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -998,19 +998,13 @@ impl Encoder26aaModChargeCnnLstmAttnSum { pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor, charges: &Tensor) -> Result { - let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let charges_repeated = charges.unsqueeze(1)?.repeat(&[1, mod_x.dim(1)?, 1])?; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - charges_repeated forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); let additional_tensors: Vec<&Tensor> = vec![&mod_x, &charges_repeated]; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; @@ -1152,6 +1146,122 @@ impl Encoder26aaModCnnTransformerAttnSum { } +/// Encode AAs (26 AA letters), modifications and Charge state using CNN + Transformer + AttentionSum. 
+#[derive(Debug, Clone)] +pub struct Encoder26aaModChargeCnnTransformerAttnSum { + mod_nn: ModEmbeddingFixFirstK, + input_cnn: SeqCNN, + input_transformer: SeqTransformer, + attn_sum: SeqAttentionSum, +} + +impl Encoder26aaModChargeCnnTransformerAttnSum { + pub fn from_varstore( + varstore: &nn::VarBuilder, + mod_hidden_dim: usize, + hidden_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + names_mod_nn: Vec<&str>, + names_input_cnn_weight: Vec<&str>, + names_input_cnn_bias: Vec<&str>, + transformer_pp: &str, + names_attn_sum: Vec<&str>, + device: &Device, + ) -> Result { + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + Ok(Self { + mod_nn: ModEmbeddingFixFirstK::from_varstore( + &varstore, + MOD_FEATURE_SIZE, + mod_hidden_dim, + names_mod_nn[0], + )?, + input_cnn: SeqCNN::from_varstore( + varstore.clone(), + input_dim, + names_input_cnn_weight, + names_input_cnn_bias, + )?, + input_transformer: SeqTransformer::from_varstore( + varstore.pp(transformer_pp).clone(), + input_dim * 4, + hidden_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?, + attn_sum: SeqAttentionSum::from_varstore( + varstore.clone(), + hidden_dim, + names_attn_sum[0], + )?, + }) + } + + /// Construct a CNN+Transformer+Attention encoder from scratch (no pretrained weights). + pub fn new( + varbuilder: &nn::VarBuilder, + mod_hidden_dim: usize, + hidden_dim: usize, + ff_dim: usize, + num_heads: usize, + num_layers: usize, + max_len: usize, + dropout_prob: f32, + device: &Device, + ) -> Result { + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + Ok(Self { + mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?, + input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?, + input_transformer: SeqTransformer::new( + &varbuilder.pp("input_transformer"), + input_dim * 4, + hidden_dim, + ff_dim, + num_heads, + num_layers, + max_len, + dropout_prob, + device, + )?, + attn_sum: SeqAttentionSum::new(hidden_dim, &varbuilder.pp("attn_sum"))?, + }) + } + + pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor, charges: &Tensor) -> Result { + let mod_x = self.mod_nn.forward(mod_x)?; + let charges_repeated = charges.unsqueeze(1)?.repeat(&[1, mod_x.dim(1)?, 1])?; + + let additional_tensors: Vec<&Tensor> = vec![&mod_x, &charges_repeated]; + let x = aa_one_hot(aa_indices, &additional_tensors) + .map_err(|e| candle_core::Error::Msg(e.to_string()))?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.input_cnn.forward(&x)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] input_cnn output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.input_transformer.forward(&x)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] input_transformer output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.attn_sum.forward(&x)?; + + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] attn_sum output stats - min: {min}, max: {max}, mean: {mean}"); + + Ok(x) + } +} #[cfg(test)] mod tests { From 146dedd4b1b70e27d82bd54d3c9d5a021a55e45a Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 22:15:49 -0400 Subject: [PATCH 27/75] refactor: Add CCSCNNTFModel implementation --- 
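A minimal construction sketch for the new architecture string; both file
paths are placeholders, and the wrapper API is the one shown in ccs_model.rs
below:

    use candle_core::Device;
    use redeem_properties::models::ccs_model::CCSModelWrapper;

    fn load_ccs_cnn_tf() -> anyhow::Result<CCSModelWrapper> {
        // "ccs_cnn_tf" dispatches to CCSCNNTFModel::new inside the wrapper.
        CCSModelWrapper::new(
            "ccs_cnn_tf.safetensors",  // placeholder checkpoint path
            "ccs.model_const.yaml",    // placeholder constants YAML
            "ccs_cnn_tf",
            Device::Cpu,
        )
    }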
.../src/models/ccs_cnn_tf_model.rs | 245 ++++++++++++++++++ .../redeem-properties/src/models/ccs_model.rs | 11 +- crates/redeem-properties/src/models/mod.rs | 7 +- 3 files changed, 257 insertions(+), 6 deletions(-) create mode 100644 crates/redeem-properties/src/models/ccs_cnn_tf_model.rs diff --git a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs new file mode 100644 index 0000000..e821546 --- /dev/null +++ b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs @@ -0,0 +1,245 @@ +use anyhow::Result; +use candle_core::{DType, Device, IndexOp, Tensor}; +use candle_nn::{Dropout, Module, VarBuilder, VarMap}; +use std::collections::HashMap; +use std::path::Path; + +use crate::building_blocks::building_blocks::{ + DecoderLinear, Encoder26aaModChargeCnnTransformerAttnSum, MOD_FEATURE_SIZE, +}; +use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map}; +use crate::utils::peptdeep_utils::{ + load_mod_to_feature, + parse_model_constants, ModelConstants, +}; +use crate::utils::utils::get_tensor_stats; + +// Constants +const CHARGE_FACTOR: f64 = 0.1; +const NCE_FACTOR: f64 = 0.01; + +// Main Model Struct + +#[derive(Clone)] +/// Represents an CNN-TF Collision Cross Section model. +pub struct CCSCNNTFModel { + var_store: VarBuilder<'static>, + varmap: VarMap, + constants: ModelConstants, + device: Device, + mod_to_feature: HashMap>, + dropout: Dropout, + ccs_encoder: Encoder26aaModChargeCnnTransformerAttnSum, + ccs_decoder: DecoderLinear, + is_training: bool, +} + +// Automatically implement Send and Sync if all fields are Send and Sync +unsafe impl Send for CCSCNNTFModel {} +unsafe impl Sync for CCSCNNTFModel {} + +// Core Model Implementation + +impl ModelInterface for CCSCNNTFModel { + fn property_type(&self) -> PropertyType { + PropertyType::CCS + } + + fn model_arch(&self) -> &'static str { + "ccs_cnn_tf" + } + + fn new_untrained(device: Device) -> Result { + let mut varmap = VarMap::new(); + let varbuilder = VarBuilder::from_varmap(&varmap, DType::F32, &device); + + log::trace!("[CCSCNNTFModel] Initializing ccs_encoder"); + let ccs_encoder = Encoder26aaModChargeCnnTransformerAttnSum::new( + &varbuilder.pp("ccs_encoder"), + 8, // mod_hidden_dim + 140, // hidden_dim + 256, // ff_dim + 4, // num_heads + 2, // num_layers + 100, // max_len + 0.1, // dropout_prob + &device + )?; + + log::trace!("[CCSCNNTFModel] Initializing ccs_decoder"); + let ccs_decoder = DecoderLinear::new(141, 1, &varbuilder.pp("ccs_decoder"))?; + let constants = ModelConstants::default(); + let mod_to_feature = load_mod_to_feature(&constants)?; + + Ok(Self { + var_store: varbuilder, + varmap, + constants, + device, + mod_to_feature, + dropout: Dropout::new(0.1), + ccs_encoder, + ccs_decoder, + is_training: true, + }) + } + + /// Create a new CCSCNNTFModel from the given model and constants files. + fn new>( + model_path: P, + constants_path: Option
<P>
, + _fixed_sequence_len: usize, + _num_frag_types: usize, + _num_modloss_types: usize, + _mask_modloss: bool, + device: Device, + ) -> Result { + let tensor_data = load_tensors_from_model(model_path.as_ref(), &device)?; + let mut varmap = candle_nn::VarMap::new(); + create_var_map(&mut varmap, tensor_data, &device)?; + let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device); + + let constants = match constants_path { + Some(path) => parse_model_constants(path.as_ref().to_str().unwrap())?, + None => ModelConstants::default(), + }; + + let mod_to_feature = load_mod_to_feature(&constants)?; + let dropout = Dropout::new(0.1); + + let ccs_encoder = Encoder26aaModChargeCnnTransformerAttnSum::from_varstore( + &var_store, + 8, // mod_hidden_dim + 140, // hidden_dim + 256, // ff_dim + 4, // num_heads + 2, // num_layers + 100, // max_len (set appropriately for your sequence length) + 0.1, // dropout_prob + vec!["ccs_encoder.mod_nn.nn.weight"], + vec![ + "ccs_encoder.input_cnn.cnn_short.weight", + "ccs_encoder.input_cnn.cnn_medium.weight", + "ccs_encoder.input_cnn.cnn_long.weight", + ], + vec![ + "ccs_encoder.input_cnn.cnn_short.bias", + "ccs_encoder.input_cnn.cnn_medium.bias", + "ccs_encoder.input_cnn.cnn_long.bias", + ], + "ccs_encoder.input_transformer", + vec!["ccs_encoder.attn_sum.attn.0.weight"], + &device, + )?; + + + let ccs_decoder = DecoderLinear::from_varstore( + &var_store, + 141, + 1, + vec!["ccs_decoder.nn.0.weight", "ccs_decoder.nn.1.weight", "ccs_decoder.nn.2.weight"], + vec!["ccs_decoder.nn.0.bias", "ccs_decoder.nn.2.bias"] + )?; + + Ok(Self { + var_store, + varmap, + constants, + device, + mod_to_feature, + dropout, + ccs_encoder, + ccs_decoder, + is_training: false, + }) + } + + fn forward(&self, xs: &Tensor) -> Result { + let (_batch_size, _seq_len, _) = xs.shape().dims3()?; + + // Separate input into aa_indices, mod_x, charge + let start_mod_x = 1; + let start_charge = start_mod_x + MOD_FEATURE_SIZE; + + let aa_indices_out = xs.i((.., .., 0))?; + let mod_x_out = xs.i((.., .., start_mod_x..start_mod_x + MOD_FEATURE_SIZE))?; + let charge_out = xs.i((.., 0..1, start_charge..start_charge + 1))?; + let charge_out = charge_out.squeeze(2)?; + + let x = self.ccs_encoder.forward(&aa_indices_out, &mod_x_out, &charge_out)?; + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[CCSCNNTFModel] ccs_encoder output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.dropout.forward(&x, self.is_training)?; + log::trace!("[CCSCNNTFModel] x.shape after dropout: {:?}", x.shape()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[CCSCNNTFModel] dropout output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.ccs_decoder.forward(&x)?; + log::trace!("[CCSCNNTFModel] x.shape after ccs_decoder: {:?}", x.shape()); + let (mean, min, max) = get_tensor_stats(&x)?; + log::debug!("[CCSCNNTFModel] ccs_decoder output stats - min: {min}, max: {max}, mean: {mean}"); + Ok(x.squeeze(1)?) + } + + /// Set model to evaluation mode for inference + /// This disables dropout and other training-specific layers. + fn set_evaluation_mode(&mut self) { + // println!("Setting evaluation mode"); + self.is_training = false; + } + + /// Set model to training mode for training + /// This enables dropout and other training-specific layers. 
+ fn set_training_mode(&mut self) { + self.is_training = true; + } + + fn get_property_type(&self) -> String { + self.property_type().clone().as_str().to_string() + } + + fn get_model_arch(&self) -> String { + self.model_arch().to_string() + } + + fn get_device(&self) -> &Device { + &self.device + } + + fn get_mod_element_count(&self) -> usize { + self.constants.mod_elements.len() + } + + fn get_mod_to_feature(&self) -> &HashMap> { + &self.mod_to_feature + } + + fn get_min_pred_intensity(&self) -> f32 { + unimplemented!("Method not implemented for architecture: {}", self.model_arch()) + } + + fn get_mut_varmap(&mut self) -> &mut VarMap { + &mut self.varmap + } + + /// Print a summary of the model's constants. + fn print_summary(&self) { + println!("CCSModel Summary:"); + println!("AA Embedding Size: {}", self.constants.aa_embedding_size.unwrap()); + println!("Charge Factor: {:?}", self.constants.charge_factor); + println!("Instruments: {:?}", self.constants.instruments); + println!("Max Instrument Num: {}", self.constants.max_instrument_num); + println!("Mod Elements: {:?}", self.constants.mod_elements); + println!("NCE Factor: {:?}", self.constants.nce_factor); + } + + /// Print the model's weights. + fn print_weights(&self) { + todo!("Implement print_weights for CCSCNNTFModel"); + } + + +} + + diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs index c6b719a..7befcf3 100644 --- a/crates/redeem-properties/src/models/ccs_model.rs +++ b/crates/redeem-properties/src/models/ccs_model.rs @@ -3,6 +3,7 @@ use candle_core::Device; use anyhow::{Result, anyhow}; use crate::models::model_interface::{ModelInterface,PredictionResult}; use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel; +use crate::models::ccs_cnn_tf_model::CCSCNNTFModel; use crate::utils::data_handling::PeptideData; use std::collections::HashMap; use crate::utils::peptdeep_utils::ModificationMap; @@ -10,11 +11,11 @@ use crate::utils::peptdeep_utils::ModificationMap; // Enum for different types of CCS models pub enum CCSModelArch { CCSCNNLSTM, - // Add other architectures here as needed + CCSCNNTF, } // Constants for different types of CCS models -pub const CCSMODEL_ARCHS: &[&str] = &["ccs_cnn_lstm"]; +pub const CCSMODEL_ARCHS: &[&str] = &["ccs_cnn_lstm", "ccs_cnn_tf"]; // A wrapper struct for CCS models pub struct CCSModelWrapper { @@ -33,7 +34,7 @@ impl CCSModelWrapper { pub fn new>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { let model: Box = match arch { "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), - // Add other cases here as you implement more models + "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), _ => return Err(anyhow!("Unsupported CCS model architecture: {}", arch)), }; @@ -44,6 +45,10 @@ impl CCSModelWrapper { self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), None, None) } + pub fn train(&mut self, training_data: &Vec, val_data: Option<&Vec>, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result, f32, Option)>> { + self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience) + } + pub fn fine_tune(&mut self, training_data: &Vec, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, 
learning_rate: f64, epochs: usize) -> Result<()> { self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs) } diff --git a/crates/redeem-properties/src/models/mod.rs b/crates/redeem-properties/src/models/mod.rs index 502cb15..b7bd4d6 100644 --- a/crates/redeem-properties/src/models/mod.rs +++ b/crates/redeem-properties/src/models/mod.rs @@ -1,8 +1,9 @@ pub mod rt_model; pub mod rt_cnn_lstm_model; -pub mod ms2_bert_model; -pub mod ccs_cnn_lstm_model; +pub mod rt_cnn_transformer_model; pub mod ccs_model; +pub mod ccs_cnn_lstm_model; +pub mod ccs_cnn_tf_model; +pub mod ms2_bert_model; pub mod ms2_model; pub mod model_interface; -pub mod rt_cnn_transformer_model; From ddc39f15435a882f78961fbf4517e13329e09320 Mon Sep 17 00:00:00 2001 From: singjc Date: Sat, 10 May 2025 22:15:57 -0400 Subject: [PATCH 28/75] refactor: Update RTCNNTFModel implementation and remove unused code --- .../src/models/rt_cnn_transformer_model.rs | 110 ------------------ 1 file changed, 110 deletions(-) diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs index 6008137..09f1b07 100644 --- a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs @@ -4,12 +4,9 @@ use candle_nn::{Dropout, Module, VarBuilder, VarMap}; use std::collections::HashMap; use std::path::Path; - - use crate::building_blocks::building_blocks::{ DecoderLinear, Encoder26aaModCnnTransformerAttnSum, MOD_FEATURE_SIZE, }; -use crate::building_blocks::nn; use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map}; use crate::utils::peptdeep_utils::{ load_mod_to_feature, @@ -249,111 +246,4 @@ impl ModelInterface for RTCNNTFModel { } -// Module Trait Implementation - -// impl Module for RTCNNLSTMModel { -// fn forward(&self, input: &Tensor) -> Result { -// ModelInterface::forward(self, input) -// } -// } - - -#[cfg(test)] -mod tests { - use crate::models::model_interface::ModelInterface; - use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; - use candle_core::Device; - use std::path::PathBuf; - - use super::*; - - #[test] - fn test_parse_model_constants() { - let path = "data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"; - let result = parse_model_constants(path); - assert!(result.is_ok()); - let constants = result.unwrap(); - assert_eq!(constants.aa_embedding_size.unwrap(), 27); - assert_eq!(constants.charge_factor, Some(0.1)); - assert_eq!(constants.instruments.len(), 4); - assert_eq!(constants.max_instrument_num, 8); - assert_eq!(constants.mod_elements.len(), 109); - assert_eq!(constants.nce_factor, Some(0.01)); - } - - #[test] - fn test_encode_peptides() { - let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); - let constants_path = - PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); - let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); - - let peptide_sequences = "AGHCEWQMKYR"; - let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; - let mod_sites = "0;4;8"; - // let charge = Some(2); - // let nce = Some(20); - // let instrument = Some("QE"); - - let result = - model.encode_peptide(&peptide_sequences, mods, mod_sites, None, None, None); - - println!("{:?}", result); - - // assert!(result.is_ok()); - // let encoded_peptides = result.unwrap(); - // 
assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1));
-    }
-    #[test]
-    fn test_encode_peptides_batch() {
-
-        let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth");
-        let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
-        let device = Device::Cpu;
-
-        let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device.clone()).unwrap();
-
-        // Batched input
-        let peptide_sequences = vec![
-            "ACDEFGHIK".to_string(),
-            "AGHCEWQMKYR".to_string(),
-        ];
-        let mods = vec![
-            "Carbamidomethyl@C".to_string(),
-            "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(),
-        ];
-        let mod_sites = vec![
-            "1".to_string(),
-            "0;4;8".to_string(),
-        ];
-
-        println!("Peptides: {:?}", peptide_sequences);
-        println!("Mods: {:?}", mods);
-        println!("Mod sites: {:?}", mod_sites);
-
-
-        let result = model.encode_peptides(
-            &peptide_sequences,
-            &mods,
-            &mod_sites,
-            None,
-            None,
-            None,
-        );
-
-        assert!(result.is_ok());
-        let tensor = result.unwrap();
-        println!("Batched encoded tensor shape: {:?}", tensor.shape());
-
-        let (batch, seq_len, feat_dim) = tensor.shape().dims3().unwrap();
-        assert_eq!(batch, 2); // two peptides
-        assert!(seq_len >= 11); // padded to max length
-        assert!(feat_dim > 1); // includes aa + mod features
-    }
-
-
-
-
-}

From b0e0f22887858303faaa74a819a335edea34865f Mon Sep 17 00:00:00 2001
From: singjc
Date: Sat, 10 May 2025 22:21:23 -0400
Subject: [PATCH 29/75] update: readme

---
 README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 8afa499..d27e5f2 100644
--- a/README.md
+++ b/README.md
@@ -30,15 +30,17 @@ The ReDeeM project consists of two primary crates:
 1. **redeem-properties**:
    - This crate focuses on deep learning models for peptide property prediction. It implements models for predicting retention time (RT), ion mobility (IM), and MS2 fragment intensities using the Candle library.
-   - The models can be fine-tuned on new data and can be saved in the safetensor format for later use.
+   - The models can be trained or fine-tuned on new data and saved in the safetensors format for later use.
    - Current Models

    Model | Name | Architecture | Implemented
    --- | --- | --- | ---
-   AlphaPept RT Model | `redeem_properties::RTCNNLSTMModel` | CNN-LSTM | :heavy_check_mark:
-   AlphaPept MS2 Model | `redeem_properties::MS2BertModel` | Bert | :heavy_check_mark:
-   AlphaPept IM Model | `redeem_properties::CCSCNNLSTMModel` | CNN-LSTM | :heavy_check_mark:
+   AlphaPept RT Model | `rt_cnn_lstm` | CNN-LSTM | :heavy_check_mark:
+   AlphaPept MS2 Model | `ms2_bert` | Bert | :heavy_check_mark:
+   AlphaPept CCS Model | `ccs_cnn_lstm` | CNN-LSTM | :heavy_check_mark:
+   RT Model | `rt_cnn_tf` | CNN-Transformer | :heavy_check_mark:
+   CCS Model | `ccs_cnn_tf` | CNN-Transformer | :heavy_check_mark:
 2. **redeem-classifiers**:
    - This crate is aimed at developing semi-supervised scoring classifier models. The goal is to create models for separating target peptides from decoys.
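The architecture names in the table above are the strings the wrapper constructors dispatch on. A minimal sketch of loading a CCS model through the wrapper (the file paths are placeholders; the signature follows `CCSModelWrapper::new` from `ccs_model.rs`):

```rust
use anyhow::Result;
use candle_core::Device;
use redeem_properties::models::ccs_model::CCSModelWrapper;

fn main() -> Result<()> {
    // The arch string must be one of CCSMODEL_ARCHS ("ccs_cnn_lstm" or "ccs_cnn_tf").
    let _model = CCSModelWrapper::new(
        "data/models/ccs.safetensors",      // placeholder model path
        "data/models/ccs.model_const.yaml", // placeholder constants path
        "ccs_cnn_tf",
        Device::Cpu,
    )?;
    Ok(())
}
```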
From 87797f37122714a935e770b02fada093f9542488 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 11 May 2025 00:14:37 -0400 Subject: [PATCH 30/75] refactor: Improve regex pattern for extracting modification indices in peptdeep_utils.rs --- .../redeem-properties/src/utils/peptdeep_utils.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs index f477527..3a6702e 100644 --- a/crates/redeem-properties/src/utils/peptdeep_utils.rs +++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs @@ -328,8 +328,20 @@ pub fn extract_masses_and_indices(peptide: &str) -> Vec<(f64, usize)> { } +/// Extracts modification indices from a peptide string. +/// The indices are 0-based and represent the positions of the modifications. +/// +/// # Example +/// ``` +/// use redeem_properties::utils::peptdeep_utils::get_modification_indices; +/// let result = get_modification_indices("AC[+57.0215]DE"); +/// assert_eq!(result, "1"); +/// +/// let result = get_modification_indices("AC(UniMod:4)DE"); +/// assert_eq!(result, "1"); +/// ``` pub fn get_modification_indices(peptide: &str) -> String { - let re = Regex::new(r"\[.*?\]").unwrap(); + let re = Regex::new(r"(\[.*?\]|\(UniMod:\d+\)|\([a-zA-Z]+\))").unwrap(); let mut indices = Vec::new(); let mut offset = 1; // Offset by 1 for 0-based index From 410074796dce184b32fa526c1e621ce5260fe3b3 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 11 May 2025 00:14:51 -0400 Subject: [PATCH 31/75] refactor: Update redeem-properties crate models for CCS prediction --- .../src/properties/train/trainer.rs | 126 +++++++++++++----- .../examples/alphapeptdeep_ccs_cnn_lstm.rs | 2 +- .../src/models/model_interface.rs | 22 +-- 3 files changed, 102 insertions(+), 48 deletions(-) diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index f16a4bf..45ed0fe 100644 --- a/crates/redeem-cli/src/properties/train/trainer.rs +++ b/crates/redeem-cli/src/properties/train/trainer.rs @@ -1,40 +1,51 @@ use anyhow::{Context, Result}; +use maud::{PreEscaped, html}; use redeem_properties::models::model_interface::ModelInterface; use redeem_properties::models::rt_model::load_retention_time_model; -use redeem_properties::models::{rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel}; +use redeem_properties::models::{ + ccs_cnn_lstm_model::CCSCNNLSTMModel, ccs_cnn_tf_model::CCSCNNTFModel, + rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel, +}; use redeem_properties::utils::data_handling::PeptideData; use redeem_properties::utils::peptdeep_utils::load_modifications; use redeem_properties::utils::utils::get_device; use report_builder::{ - plots::{plot_boxplot, plot_pp, plot_scatter, plot_score_histogram}, Report, ReportSection, + plots::plot_scatter, }; -use maud::{html, PreEscaped}; -use input::PropertyTrainConfig; -use load_data::load_peptide_data; use crate::properties::load_data; use crate::properties::train::plot::plot_losses; use crate::properties::train::sample_peptides; use crate::properties::util::write_bytes_to_file; +use input::PropertyTrainConfig; +use load_data::load_peptide_data; use super::input; pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { - // Load training data - let (train_peptides, norm_factor) = load_peptide_data(&config.train_data, Some(config.nce), Some(config.instrument.clone()), true)?; + let (train_peptides, norm_factor) = 
load_peptide_data( + &config.train_data, + Some(config.nce), + Some(config.instrument.clone()), + true, + )?; log::info!("Loaded {} training peptides", train_peptides.len()); // Load validation data if specified let (val_peptides, _val_norm_factor) = if let Some(ref val_path) = config.validation_data { - let (peptides, norm) = load_peptide_data(val_path, Some(config.nce), Some(config.instrument.clone()), true) - .context("Failed to load validation data")?; + let (peptides, norm) = load_peptide_data( + val_path, + Some(config.nce), + Some(config.instrument.clone()), + true, + ) + .context("Failed to load validation data")?; (Some(peptides), Some(norm)) } else { (None, None) }; - if let Some(ref val_data) = val_peptides { log::info!("Loaded {} validation peptides", val_data.len()); @@ -45,29 +56,75 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { // Dispatch model training based on architecture let model_arch = config.model_arch.as_str(); let device = get_device(&config.device)?; - log::trace!("Loading model architecture: {} on device: {:?}", model_arch, device); + log::trace!( + "Loading model architecture: {} on device: {:?}", + model_arch, + device + ); let mut model: Box = match &config.checkpoint_file { Some(checkpoint_path) => { log::info!("Loading model from checkpoint: {}", checkpoint_path); match config.model_arch.as_str() { - "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(checkpoint_path, None, 0, 8, 4, true, device.clone())?), - "rt_cnn_tf" => Box::new(RTCNNTFModel::new(checkpoint_path, None, 0, 8, 4, true, device.clone())?), - _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", config.model_arch)), + "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new( + checkpoint_path, + None, + 0, + 8, + 4, + true, + device.clone(), + )?), + "rt_cnn_tf" => Box::new(RTCNNTFModel::new( + checkpoint_path, + None, + 0, + 8, + 4, + true, + device.clone(), + )?), + "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new( + checkpoint_path, + None, + 0, + 8, + 4, + true, + device.clone(), + )?), + "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new( + checkpoint_path, + None, + 0, + 8, + 4, + true, + device.clone(), + )?), + _ => { + return Err(anyhow::anyhow!( + "Unsupported model architecture: {}", + config.model_arch + )); + } } } - None => { - match config.model_arch.as_str() { - "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new_untrained(device.clone())?), - "rt_cnn_tf" => Box::new(RTCNNTFModel::new_untrained(device.clone())?), - _ => return Err(anyhow::anyhow!("Unsupported model architecture: {}", config.model_arch)), + None => match config.model_arch.as_str() { + "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new_untrained(device.clone())?), + "rt_cnn_tf" => Box::new(RTCNNTFModel::new_untrained(device.clone())?), + "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new_untrained(device.clone())?), + "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new_untrained(device.clone())?), + _ => { + return Err(anyhow::anyhow!( + "Unsupported model architecture: {}", + config.model_arch + )); } - } + }, }; - - + log::trace!("Model loaded successfully"); - log::trace!("Loading modifications map"); let modifications = load_modifications().context("Failed to load modifications")?; @@ -107,37 +164,34 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { // Lets perform inference on 1000 random samples from the validation set let val_peptides: Vec = sample_peptides(&val_peptides.as_ref().unwrap(), 1000); - let inference_results: Vec = model.inference( - &val_peptides, - config.batch_size, - 
modifications, - norm_factor, - )?; + let inference_results: Vec = + model.inference(&val_peptides, config.batch_size, modifications, norm_factor)?; let (true_rt, pred_rt): (Vec, Vec) = val_peptides .iter() .zip(&inference_results) .filter_map(|(true_pep, pred_pep)| { match (true_pep.retention_time, pred_pep.retention_time) { (Some(t), Some(p)) => { - let t_denorm = t as f64 * norm_factor.unwrap().1 as f64 + norm_factor.unwrap().0 as f64; // de-normalized true RT - Some((t_denorm, p as f64)) // assume predicted is already de-normalized - }, + let t_denorm = t as f64 * norm_factor.unwrap().1 as f64 + + norm_factor.unwrap().0 as f64; // de-normalized true RT + Some((t_denorm, p as f64)) // assume predicted is already de-normalized + } _ => None, } }) .unzip(); - let scatter_plot = plot_scatter( &vec![true_rt.clone()], &vec![pred_rt.clone()], vec!["RT Prediction".to_string()], "Predicted vs True RT", "Target RT", - "Predicted RT" - ).unwrap(); + "Predicted RT", + ) + .unwrap(); overview_section.add_plot(scatter_plot); - report.add_section(overview_section); + report.add_section(overview_section); } /* Section 2: Configuration */ diff --git a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs index 63973da..5b7f52b 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs @@ -45,7 +45,7 @@ fn run_prediction(model: &mut CCSCNNLSTMModel, prediction_context: &PredictionCo None, ) { Ok(predictions) => { - if let PredictionResult::IMResult(ccs_preds) = predictions { + if let PredictionResult::CCSResult(ccs_preds) = predictions { let total_error: f32 = ccs_preds .iter() .zip(prediction_context.observed_ccs.iter()) diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 120dd48..9cb4f84 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -132,7 +132,7 @@ impl Index<(usize, usize)> for PredictionValue { #[derive(Debug, Clone)] pub enum PredictionResult { RTResult(Vec), - IMResult(Vec), + CCSResult(Vec), MS2Result(Vec>>), } @@ -140,7 +140,7 @@ impl PredictionResult { pub fn len(&self) -> usize { match self { PredictionResult::RTResult(vec) => vec.len(), - PredictionResult::IMResult(vec) => vec.len(), + PredictionResult::CCSResult(vec) => vec.len(), PredictionResult::MS2Result(vec) => vec.len(), } } @@ -148,7 +148,7 @@ impl PredictionResult { pub fn get_prediction_entry(&self, index: usize) -> PredictionValue { match self { PredictionResult::RTResult(vec) => PredictionValue::Single(vec[index].clone()), - PredictionResult::IMResult(vec) => PredictionValue::Single(vec[index].clone()), + PredictionResult::CCSResult(vec) => PredictionValue::Single(vec[index].clone()), PredictionResult::MS2Result(vec) => PredictionValue::Matrix(vec[index].clone()), } } @@ -266,7 +266,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { } PropertyType::CCS => { let predictions: Vec = output.to_vec1()?; - Ok(PredictionResult::IMResult(predictions)) + Ok(PredictionResult::CCSResult(predictions)) } PropertyType::MS2 => { let out = self.process_predictions(&output, self.get_min_pred_intensity())?; @@ -447,7 +447,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { let num_batches = (training_data.len() + batch_size - 1) / batch_size; info!( - "Training {} model from scratch on {} peptide features ({} 
batches) for {} epochs",
+            "Training {} model on {} peptide features ({} batches) for {} epochs",
             self.get_model_arch(),
             training_data.len(),
             num_batches,
@@ -491,7 +491,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                     PropertyType::RT => PredictionResult::RTResult(
                         batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(),
                     ),
-                    PropertyType::CCS => PredictionResult::IMResult(
+                    PropertyType::CCS => PredictionResult::CCSResult(
                         batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(),
                     ),
                     PropertyType::MS2 => {
@@ -500,7 +500,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                 };

                 let target_batch = match batch_targets {
-                    PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => {
+                    PredictionResult::RTResult(ref values) | PredictionResult::CCSResult(ref values) => {
                         Tensor::new(values.clone(), &self.get_device())?
                     }
                     PredictionResult::MS2Result(_) => unreachable!(),
@@ -548,7 +548,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                     PropertyType::RT => PredictionResult::RTResult(
                         batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(),
                     ),
-                    PropertyType::CCS => PredictionResult::IMResult(
+                    PropertyType::CCS => PredictionResult::CCSResult(
                         batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(),
                     ),
                     PropertyType::MS2 => {
@@ -557,7 +557,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                 };

                 let target_val = match val_targets {
-                    PredictionResult::RTResult(ref values) | PredictionResult::IMResult(ref values) => {
+                    PredictionResult::RTResult(ref values) | PredictionResult::CCSResult(ref values) => {
                         Tensor::new(values.clone(), &self.get_device())?
                     }
                     PredictionResult::MS2Result(_) => unreachable!(),
@@ -716,7 +716,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                         .map(|p| p.retention_time.unwrap_or_default())
                         .collect(),
                 ),
-                PropertyType::CCS => PredictionResult::IMResult(
+                PropertyType::CCS => PredictionResult::CCSResult(
                     batch_data
                         .iter()
                         .map(|p| p.ion_mobility.unwrap_or_default())
@@ -732,7 +732,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                 let target_batch = match batch_targets {
                     PredictionResult::RTResult(ref values)
-                    | PredictionResult::IMResult(ref values) => {
+                    | PredictionResult::CCSResult(ref values) => {
                         Tensor::new(values.clone(), &self.get_device())?
} PredictionResult::MS2Result(ref spectra) => { From a6d944fbd0713c88384c3e7f643d2bc922d303d6 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 11 May 2025 00:14:59 -0400 Subject: [PATCH 32/75] refactor: Add new fields to load_peptide_data function in redeem-cli --- crates/redeem-cli/src/properties/load_data.rs | 39 +++++++++++++------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs index 0c423ef..6e85e22 100644 --- a/crates/redeem-cli/src/properties/load_data.rs +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -34,27 +34,39 @@ pub fn load_peptide_data>( let record = result?; let sequence = record - .get(headers.iter().position(|h| h == "sequence").unwrap_or(2)) + .get(headers.iter().position(|h| h.to_lowercase() == "sequence").unwrap_or(2)) .unwrap_or("") .to_string(); let retention_time = record - .get(headers.iter().position(|h| h == "retention time").unwrap_or(3)) + .get(headers.iter().position(|h| h.to_lowercase() == "retention time").unwrap_or(3)) .and_then(|s| s.parse::().ok()); let charge = record - .get(headers.iter().position(|h| h == "charge").unwrap_or(usize::MAX)) + .get(headers.iter().position(|h| h.to_lowercase() == "charge").unwrap_or(usize::MAX)) .and_then(|s| s.parse::().ok()); + let precursor_mass = record + .get(headers.iter().position(|h| h.to_lowercase() == "precursor_mass").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()); + + let ion_mobility = record + .get(headers.iter().position(|h| h.to_lowercase() == "ion_mobility").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()); + + let ccs = record + .get(headers.iter().position(|h| h.to_lowercase() == "ccs").unwrap_or(usize::MAX)) + .and_then(|s| s.parse::().ok()); + let in_nce = nce.or_else(|| { record - .get(headers.iter().position(|h| h == "nce").unwrap_or(usize::MAX)) + .get(headers.iter().position(|h| h.to_lowercase() == "nce").unwrap_or(usize::MAX)) .and_then(|s| s.parse::().ok()) }); let in_instrument = instrument.clone().or_else(|| { record - .get(headers.iter().position(|h| h == "instrument").unwrap_or(usize::MAX)) + .get(headers.iter().position(|h| h.to_lowercase() == "instrument").unwrap_or(usize::MAX)) .map(|s| s.to_string()) }); @@ -62,15 +74,17 @@ pub fn load_peptide_data>( rt_values.push(rt); } - peptides.push(PeptideData::new( - &sequence, + peptides.push(PeptideData { + sequence, charge, - in_nce, - in_instrument.as_deref(), + precursor_mass, + nce: in_nce, + instrument: in_instrument, retention_time, - None, - None, - )); + ion_mobility, + ccs, + ms2_intensities: None + }); } if normalize_rt && !rt_values.is_empty() { @@ -93,3 +107,4 @@ pub fn load_peptide_data>( Ok((peptides, None)) } } + From b22f28d33f1c430d458c688a68b4db28ddf253a1 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 11 May 2025 02:40:40 -0400 Subject: [PATCH 33/75] refactor: Add stats module to redeem-properties crate, and add lr scheduler --- .../src/properties/train/trainer.rs | 59 ++- .../redeem-properties/src/models/ccs_model.rs | 3 +- .../src/models/model_interface.rs | 413 +++++++++++------- .../redeem-properties/src/models/rt_model.rs | 3 +- crates/redeem-properties/src/utils/mod.rs | 3 +- crates/redeem-properties/src/utils/stats.rs | 231 ++++++++++ crates/redeem-properties/src/utils/utils.rs | 49 ++- 7 files changed, 580 insertions(+), 181 deletions(-) create mode 100644 crates/redeem-properties/src/utils/stats.rs diff --git a/crates/redeem-cli/src/properties/train/trainer.rs 
b/crates/redeem-cli/src/properties/train/trainer.rs
index 45ed0fe..2b2695b 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -131,7 +131,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
     let start_time = std::time::Instant::now();
     log::trace!("Training started");

-    let epoch_losses = model.train(
+    let train_step_metrics = model.train(
         &train_peptides,
         val_peptides.as_ref(),
         modifications.clone(),
@@ -142,13 +142,15 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         config.early_stopping_patience,
     )?;
     log::info!("Training completed in {:?}", start_time.elapsed());
+    model.save(&config.output_file)?;
+    log::info!("Model saved to: {}", config.output_file);

     // Generate report
     let mut report = Report::new(
         "ReDeeM",
         &config.version,
         Some("https://github.com/singjc/redeem/blob/master/img/redeem_logo.png?raw=true"),
-        "ReDeeM Trainer Report",
+        format!("ReDeeM ({}) Trainer Report", config.model_arch).as_str(),
     );

     /* Section 1: Overview */
     {
         let mut overview_section = ReportSection::new("Overview");

         overview_section.add_content(html! {
-            "This report summarizes the training process of the ReDeeM model."
+            "This report summarizes the training process of the ReDeeM model. It includes epoch-level summaries and step-wise dynamics such as learning rate scheduling and accuracy tracking over time. These plots provide insight into model convergence behavior and training stability."
         });

+        let epoch_losses = train_step_metrics.summarize_loss_for_plotting();
         let losses_plot = plot_losses(&epoch_losses);
         overview_section.add_plot(losses_plot);

-        // Lets perform inference on 1000 random samples from the validation set
+        // Step-wise learning rate plot
+        let lr_plot = plot_training_metric(
+            &train_step_metrics,
+            "lr",
+            "Learning Rate Over Steps",
+            "Step",
+            "Learning Rate",
+        );
+        overview_section.add_plot(lr_plot);
+
+        // Step-wise loss plot
+        let step_loss_plot = plot_training_metric(
+            &train_step_metrics,
+            "loss",
+            "Loss Over Steps",
+            "Step",
+            "Loss",
+        );
+        overview_section.add_plot(step_loss_plot);
+
+        // Step-wise accuracy plot
+        let acc_plot = plot_training_metric(
+            &train_step_metrics,
+            "accuracy",
+            "Accuracy Over Steps",
+            "Step",
+            "Accuracy",
+        );
+        overview_section.add_plot(acc_plot);
+
+        // Inference scatter plot
         let val_peptides: Vec<PeptideData> = sample_peptides(&val_peptides.as_ref().unwrap(), 1000);
         let inference_results: Vec<PeptideData> =
             model.inference(&val_peptides, config.batch_size, modifications, norm_factor)?;
@@ -173,8 +206,8 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
                 match (true_pep.retention_time, pred_pep.retention_time) {
                     (Some(t), Some(p)) => {
                         let t_denorm = t as f64 * norm_factor.unwrap().1 as f64
-                            + norm_factor.unwrap().0 as f64; // de-normalized true RT
-                        Some((t_denorm, p as f64)) // assume predicted is already de-normalized
+                            + norm_factor.unwrap().0 as f64;
+                        Some((t_denorm, p as f64))
                     }
                     _ => None,
                 }
@@ -184,16 +217,18 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         let scatter_plot = plot_scatter(
             &vec![true_rt.clone()],
             &vec![pred_rt.clone()],
-            vec!["RT Prediction".to_string()],
-            "Predicted vs True RT",
-            "Target RT",
-            "Predicted RT",
+            vec!["Prediction".to_string()],
+            "Predicted vs True (Random 1000 Validation Peptides)",
+            "Target",
+            "Predicted",
         )
         .unwrap();

         overview_section.add_plot(scatter_plot);
+
        report.add_section(overview_section);
    }

+
    /* Section 2: Configuration */
    {
        let mut config_section = ReportSection::new("Configuration");
diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 7befcf3..43b5c87 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -5,6 +5,7 @@ use crate::models::model_interface::{ModelInterface,PredictionResult};
 use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
 use crate::models::ccs_cnn_tf_model::CCSCNNTFModel;
 use crate::utils::data_handling::PeptideData;
+use crate::utils::stats::TrainingStepMetrics;
 use std::collections::HashMap;
 use crate::utils::peptdeep_utils::ModificationMap;
@@ -45,7 +46,7 @@ impl CCSModelWrapper {
         self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), None, None)
     }

-    pub fn train(&mut self, training_data: &Vec<PeptideData>, val_data: Option<&Vec<PeptideData>>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result<Vec<(usize, f32, Option<f32>, f32, Option<f32>)>> {
+    pub fn train(&mut self, training_data: &Vec<PeptideData>, val_data: Option<&Vec<PeptideData>>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result<TrainingStepMetrics> {
         self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience)
     }

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 9cb4f84..9d3f1ef 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -2,12 +2,10 @@ use crate::{
     building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed},
     models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper},
     utils::{
-        data_handling::PeptideData,
-        logging::Progress,
-        peptdeep_utils::{
+        data_handling::PeptideData, logging::Progress, peptdeep_utils::{
             get_modification_indices, get_modification_string, parse_instrument_index,
             remove_mass_shift,
-        },
+        }, stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, utils::{CosineWithWarmup, LRScheduler}
     },
 };
 use anyhow::{Context, Result};
@@ -210,11 +208,11 @@ pub trait ModelInterface: Send + Sync + ModelClone {
         Self: Sized;

     /// Create a new instance of the model, given a pretrained model (.pth or .safetensors) and a constants file.
-    ///
+    ///
     /// # Arguments
     /// * `model_path` - Path to the model file (.pth or .safetensors).
     /// * `constants_path` - Optional path to the model constants file (.yaml). If none, will use the default constants.
-    ///
+    ///
     fn new<P: AsRef<Path>>(
         model_path: P,
         constants_path: Option
<P>
, @@ -252,9 +250,9 @@ pub trait ModelInterface: Send + Sync + ModelClone { instrument: Option>, ) -> Result { // Encode the batch of peptides - let input_tensor = - self.encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument)? - .to_device(self.get_device())?; + let input_tensor = self + .encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument)? + .to_device(self.get_device())?; // Forward pass through the model let output = self.forward(&input_tensor)?; @@ -443,9 +441,11 @@ pub trait ModelInterface: Send + Sync + ModelClone { learning_rate: f64, epochs: usize, early_stopping_patience: usize, - ) -> Result, f32, Option)>> { + ) -> Result { let num_batches = (training_data.len() + batch_size - 1) / batch_size; - + let total_steps = num_batches * epochs; + let warmup_steps = total_steps / 10; // 10% of total steps + info!( "Training {} model from on {} peptide features ({} batches) for {} epochs", self.get_model_arch(), @@ -453,138 +453,158 @@ pub trait ModelInterface: Send + Sync + ModelClone { num_batches, epochs ); - + + let mut step_metrics = TrainingStepMetrics { + epochs: vec![], + steps: vec![], + learning_rates: vec![], + losses: vec![], + phases: vec![], + precisions: vec![], + recalls: vec![], + accuracies: vec![], + }; + + let mut step_idx = 0; + let mut val_step_idx = 0; + let params = candle_nn::ParamsAdamW { lr: learning_rate, ..Default::default() }; let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; - + let mut lr_scheduler = CosineWithWarmup::new( + learning_rate, + warmup_steps, + total_steps, + 0.5 // one full cosine cycle + ); + let mut best_val_loss = f32::INFINITY; let mut epochs_without_improvement = 0; let mut epoch_losses = vec![]; - + for epoch in 0..epochs { let progress = Progress::new(num_batches, &format!("[training] Epoch {}: ", epoch)); let mut batch_losses = vec![]; - - training_data - .chunks(batch_size) - .enumerate() - .try_for_each(|(_batch_idx, batch_data)| { - let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); - let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - - let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - - let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; - - let batch_targets = match self.property_type() { - PropertyType::RT => PredictionResult::RTResult( - batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), - ), - PropertyType::CCS => PredictionResult::CCSResult( - batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), - ), - PropertyType::MS2 => { - return Err(anyhow::anyhow!("Training from scratch is not yet implemented for MS2")); - } - }; - - let target_batch = match batch_targets { - PredictionResult::RTResult(ref values) | PredictionResult::CCSResult(ref values) => { - Tensor::new(values.clone(), &self.get_device())? 
- } - PredictionResult::MS2Result(_) => unreachable!(), - }.to_device(self.get_device())?; - + + training_data.chunks(batch_size).enumerate().try_for_each( + |(_batch_idx, batch_data)| -> anyhow::Result<()> { + let (input_batch, target_batch) = + self.prepare_batch_inputs(batch_data, &modifications)?; + let predicted = self.forward(&input_batch)?; let loss = candle_nn::loss::mse(&predicted, &target_batch)?; opt.backward_step(&loss)?; - + + // Update learning rate after optimizer step + opt.set_learning_rate(lr_scheduler.get_last_lr()); + lr_scheduler.step(); + let loss_val = loss.to_vec0::().unwrap_or(999.0); batch_losses.push(loss_val); - - progress.update_description(&format!("[training] Epoch {}: Loss: {:.4}", epoch, loss_val)); + + let predictions = predicted.to_vec1::()?; + let targets = target_batch.to_vec1::()?; + + let acc = match self.property_type() { + PropertyType::RT => Some(Metrics::accuracy(&predictions, &targets, 0.5)), // is predicted RT within 0.5 min of target RT? + PropertyType::CCS => { + let tol: Vec = targets.iter().map(|t| t * 0.02).collect(); + Some(Metrics::accuracy_dynamic(&predictions, &targets, &tol)) + }, // is predicted CCS within 2% of target CCS? + _ => None, + }; + + step_metrics.epochs.push(epoch); + step_metrics.steps.push(step_idx); + step_metrics.learning_rates.push(lr_scheduler.get_last_lr() as f64); + step_metrics.losses.push(loss_val); + step_metrics.phases.push(TrainingPhase::Train); + step_metrics.accuracies.push(acc); + step_metrics.precisions.push(None); + step_metrics.recalls.push(None); + step_idx += 1; + + progress.update_description(&format!( + "[training] Epoch {}: Loss: {:.4}", + epoch, loss_val + )); progress.inc(); - + Ok(()) - })?; - - let avg_loss = batch_losses.iter().copied().sum::() / batch_losses.len() as f32; - let std_loss = (batch_losses.iter().map(|l| (l - avg_loss).powi(2)).sum::() / batch_losses.len() as f32).sqrt(); - + }, + )?; + + let (avg_loss, std_loss) = compute_loss_stats(&batch_losses); + if let Some(val_data) = validation_data { - let val_batches = (val_data.len() + validation_batch_size - 1) / validation_batch_size; - use rayon::prelude::*; - - let val_losses: Vec = val_data + let val_batches = + (val_data.len() + validation_batch_size - 1) / validation_batch_size; + + let val_results: Vec<(f32, usize, f64, Option)> = val_data .par_chunks(validation_batch_size) - .map(|batch_data| { - let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); - let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - - let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - - let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_val = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; - - let val_targets = match self.property_type() { - PropertyType::RT => PredictionResult::RTResult( - batch_data.iter().map(|p| p.retention_time.unwrap_or_default()).collect(), - ), - PropertyType::CCS => 
PredictionResult::CCSResult( - batch_data.iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), - ), - PropertyType::MS2 => { - return Err(anyhow::anyhow!("Validation not supported for MS2 yet")); - } - }; - - let target_val = match val_targets { - PredictionResult::RTResult(ref values) | PredictionResult::CCSResult(ref values) => { - Tensor::new(values.clone(), &self.get_device())? - } - PredictionResult::MS2Result(_) => unreachable!(), - }.to_device(self.get_device())?; - + .enumerate() + .map(|(idx, batch_data)| { + let (input_val, target_val) = self.prepare_batch_inputs(batch_data, &modifications)?; let predicted = self.forward(&input_val)?; let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; - Ok(val_loss.to_vec0::()?) + let loss_val = val_loss.to_vec0::()?; + + let predictions = predicted.to_vec1::()?; + let targets = target_val.to_vec1::()?; + + let acc = match self.property_type() { + PropertyType::RT => Some(Metrics::accuracy(&predictions, &targets, 0.5)), + PropertyType::CCS => { + let tol: Vec = targets.iter().map(|t| t * 0.02).collect(); + Some(Metrics::accuracy_dynamic(&predictions, &targets, &tol)) + }, + _ => None, + }; + + Ok((loss_val, idx, lr_scheduler.get_last_lr(), acc)) }) - .collect::>>()?; - - let avg_val_loss = val_losses.iter().sum::() / val_losses.len() as f32; - let std_val_loss = (val_losses.iter().map(|l| (l - avg_val_loss).powi(2)).sum::() / val_losses.len() as f32).sqrt(); - - epoch_losses.push((epoch, avg_loss, Some(avg_val_loss), std_loss, Some(std_val_loss))); - + .collect::>()?; + + for (val_loss, idx, lr, acc) in &val_results { + step_metrics.epochs.push(epoch); + step_metrics.steps.push(val_step_idx + idx); + step_metrics.learning_rates.push(*lr); + step_metrics.losses.push(*val_loss); + step_metrics.phases.push(TrainingPhase::Validation); + step_metrics.accuracies.push(*acc); + step_metrics.precisions.push(None); + step_metrics.recalls.push(None); + } + val_step_idx += val_results.len(); + + let val_losses: Vec = val_results.iter().map(|(loss, _, _, _)| *loss).collect(); + let (avg_val_loss, std_val_loss): (f32, f32) = compute_loss_stats(&val_losses); + + epoch_losses.push(( + epoch, + avg_loss, + Some(avg_val_loss), + std_loss, + Some(std_val_loss), + )); + progress.update_description(&format!( "Epoch {}: Avg. Train Loss: {:.4} (±{:.4}) | Avg. Val. 
Loss: {:.4} (±{:.4})",
                    epoch, avg_loss, std_loss, avg_val_loss, std_val_loss
                ));
                progress.finish();
-
+
                if avg_val_loss < best_val_loss {
                    best_val_loss = avg_val_loss;
                    epochs_without_improvement = 0;
-
-                    let checkpoint_path = format!("redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), epoch);
+
+                    let checkpoint_path = format!(
+                        "redeem_{}_best_val_ckpt_model_epoch_{}.safetensors",
+                        self.get_model_arch(),
+                        epoch
+                    );
                    self.get_mut_varmap().save(&checkpoint_path)?;
                } else {
                    epochs_without_improvement += 1;
@@ -592,20 +612,52 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                        info!("Early stopping triggered after {} epochs without validation loss improvement.", early_stopping_patience);
                        return Ok(step_metrics);
                    }
+                    let checkpoint_path = format!(
+                        "redeem_{}_ckpt_model_epoch_{}.safetensors",
+                        self.get_model_arch(),
+                        epoch - 1
+                    );
+                    // Delete the previous epoch's checkpoint if it exists
+                    if PathBuf::from(&checkpoint_path).exists() {
+                        std::fs::remove_file(&checkpoint_path)?;
+                    }
+                    // Save the current checkpoint
+                    let checkpoint_path = format!(
+                        "redeem_{}_ckpt_model_epoch_{}.safetensors",
+                        self.get_model_arch(),
+                        epoch
+                    );
+                    self.get_mut_varmap().save(&checkpoint_path)?;
                }
            } else {
                epoch_losses.push((epoch, avg_loss, None, std_loss, None));
-                progress.update_description(&format!("Epoch {}: Avg. Train Loss: {:.4} (±{:.4})", epoch, avg_loss, std_loss));
+                progress.update_description(&format!(
+                    "Epoch {}: Avg. Train Loss: {:.4} (±{:.4})",
+                    epoch, avg_loss, std_loss
+                ));
                progress.finish();
-
-                let checkpoint_path = format!("redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), epoch);
+
+                let checkpoint_path = format!(
+                    "redeem_{}_ckpt_model_epoch_{}.safetensors",
+                    self.get_model_arch(),
+                    epoch - 1
+                );
+                // Delete the previous epoch's checkpoint if it exists
+                if PathBuf::from(&checkpoint_path).exists() {
+                    std::fs::remove_file(&checkpoint_path)?;
+                }
+                // Save the current checkpoint
+                let checkpoint_path = format!(
+                    "redeem_{}_ckpt_model_epoch_{}.safetensors",
+                    self.get_model_arch(),
+                    epoch
+                );
+                self.get_mut_varmap().save(&checkpoint_path)?;
            }
        }
-
-        Ok(epoch_losses)
+
+        Ok(step_metrics)
    }

    /// Fine-tune the model on a batch of training data.
    ///
@@ -700,8 +752,9 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                    None
                };

-                let input_batch =
-                    self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?;
+                let input_batch = self
+                    .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?
+                    .to_device(self.get_device())?;

                log::trace!(
                    "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}",
@@ -752,7 +805,8 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                            feature_dim,
                        ))?
} - }.to_device(self.get_device())?; + } + .to_device(self.get_device())?; let predicted = self.forward(&input_batch)?; let loss = candle_nn::loss::mse(&predicted, &target_batch)?; @@ -795,32 +849,61 @@ pub trait ModelInterface: Send + Sync + ModelClone { inference_data.len(), num_batches ); - + let progress = Progress::new(inference_data.len(), "[inference] Batch:"); let mut result: Vec> = vec![None; inference_data.len()]; - + inference_data .par_chunks(batch_size) .enumerate() .map(|(batch_idx, batch_data)| { let start_idx = batch_idx * batch_size; - - let peptides: Vec = batch_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let mods: Vec = batch_data.iter().map(|p| get_modification_string(&p.sequence, &modifications)).collect(); - let mod_sites: Vec = batch_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - let charges = batch_data.iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; - + + let peptides: Vec = batch_data + .iter() + .map(|p| remove_mass_shift(&p.sequence)) + .collect(); + let mods: Vec = batch_data + .iter() + .map(|p| get_modification_string(&p.sequence, &modifications)) + .collect(); + let mod_sites: Vec = batch_data + .iter() + .map(|p| get_modification_indices(&p.sequence)) + .collect(); + + let charges = batch_data + .iter() + .filter_map(|p| p.charge) + .collect::>(); + let charges = if charges.len() == batch_data.len() { + Some(charges) + } else { + None + }; + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - - let instruments = batch_data.iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; - - let input_tensor = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + let nces = if nces.len() == batch_data.len() { + Some(nces) + } else { + None + }; + + let instruments = batch_data + .iter() + .filter_map(|p| p.instrument.clone()) + .collect::>(); + let instruments = if instruments.len() == batch_data.len() { + Some(instruments) + } else { + None + }; + + let input_tensor = self + .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)? + .to_device(self.get_device())?; let output = self.forward(&input_tensor)?; - + match self.property_type() { PropertyType::RT | PropertyType::CCS => { let predictions = output.to_vec1()?; @@ -831,21 +914,24 @@ pub trait ModelInterface: Send + Sync + ModelClone { let mut peptide = batch_data[i].clone(); match self.property_type() { PropertyType::RT => { - peptide.retention_time = if let Some((mean, std)) = rt_norm_params { - Some(pred * std + mean) - } else { - Some(pred) - }; + peptide.retention_time = + if let Some((mean, std)) = rt_norm_params { + Some(pred * std + mean) + } else { + Some(pred) + }; } PropertyType::CCS => peptide.ion_mobility = Some(pred), _ => {} - }; + }; (start_idx + i, peptide) }) .collect(); Ok(updated) } - PropertyType::MS2 => Err(anyhow::anyhow!("Inference not supported for MS2 models in batch mode")), + PropertyType::MS2 => Err(anyhow::anyhow!( + "Inference not supported for MS2 models in batch mode" + )), } }) .collect::>>>()? 
@@ -855,12 +941,41 @@ pub trait ModelInterface: Send + Sync + ModelClone { result[idx] = Some(peptide); progress.inc(); }); - + progress.finish(); Ok(result.into_iter().flatten().collect()) } - - + + /// Extract encoded input and target tensor for a batch of peptides. + fn prepare_batch_inputs( + &self, + batch_data: &[PeptideData], + modifications: &HashMap<(String, Option), crate::utils::peptdeep_utils::ModificationMap>, + ) -> Result<(Tensor, Tensor)> { + let peptides: Vec = batch_data.par_iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = batch_data.par_iter().map(|p| get_modification_string(&p.sequence, modifications)).collect(); + let mod_sites: Vec = batch_data.par_iter().map(|p| get_modification_indices(&p.sequence)).collect(); + + let charges = batch_data.par_iter().filter_map(|p| p.charge).collect::>(); + let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; + + let nces = batch_data.par_iter().filter_map(|p| p.nce).collect::>(); + let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; + + let instruments = batch_data.par_iter().filter_map(|p| p.instrument.clone()).collect::>(); + let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; + + let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + + let target_values: Vec = match self.property_type() { + PropertyType::RT => batch_data.par_iter().map(|p| p.retention_time.unwrap_or_default()).collect(), + PropertyType::CCS => batch_data.par_iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), + PropertyType::MS2 => return Err(anyhow::anyhow!("MS2 training is not yet implemented")), + }; + + let target_tensor = Tensor::new(target_values, &self.get_device())?; + Ok((input_batch, target_tensor)) + } /// Set model to evaluation mode for inference /// This disables dropout and other training-specific layers. @@ -936,6 +1051,8 @@ pub trait ModelInterface: Send + Sync + ModelClone { } } + + /// Parameters for the `predict` method of a `ModelInterface` implementation. 
#[derive(Clone)]
pub struct Parameters {
diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs
index f4a9643..3b9672c 100644
--- a/crates/redeem-properties/src/models/rt_model.rs
+++ b/crates/redeem-properties/src/models/rt_model.rs
@@ -9,6 +9,7 @@ use crate::models::model_interface::{ModelInterface,PredictionResult};
 use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel;
 use crate::models::rt_cnn_transformer_model::RTCNNTFModel;
 use crate::utils::data_handling::PeptideData;
+use crate::utils::stats::TrainingStepMetrics;
 use std::collections::HashMap;
 use crate::utils::peptdeep_utils::ModificationMap;
@@ -50,7 +51,7 @@ impl RTModelWrapper {
         self.model.predict(peptide_sequence, mods, mod_sites, None, None, None)
     }

-    pub fn train(&mut self, training_data: &Vec<PeptideData>, val_data: Option<&Vec<PeptideData>>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result<Vec<(usize, f32, Option<f32>, f32, Option<f32>)>> {
+    pub fn train(&mut self, training_data: &Vec<PeptideData>, val_data: Option<&Vec<PeptideData>>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result<TrainingStepMetrics> {
         self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience)
     }

diff --git a/crates/redeem-properties/src/utils/mod.rs b/crates/redeem-properties/src/utils/mod.rs
index 10069ff..139fa1b 100644
--- a/crates/redeem-properties/src/utils/mod.rs
+++ b/crates/redeem-properties/src/utils/mod.rs
@@ -1,4 +1,5 @@
 pub mod peptdeep_utils;
 pub mod logging;
 pub mod utils;
-pub mod data_handling;
\ No newline at end of file
+pub mod data_handling;
+pub mod stats;
\ No newline at end of file

diff --git a/crates/redeem-properties/src/utils/stats.rs b/crates/redeem-properties/src/utils/stats.rs
new file mode 100644
index 0000000..da2318a
--- /dev/null
+++ b/crates/redeem-properties/src/utils/stats.rs
@@ -0,0 +1,231 @@
+/// Represents a single phase of training: either Training or Validation.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum TrainingPhase {
+    Train,
+    Validation,
+}
+
+/// Stores step-wise metrics for all training/validation iterations in a Struct of Arrays layout.
+#[derive(Debug, Clone)]
+pub struct TrainingStepMetrics {
+    pub epochs: Vec<usize>,
+    pub steps: Vec<usize>,
+    pub learning_rates: Vec<f64>,
+    pub losses: Vec<f32>,
+    pub phases: Vec<TrainingPhase>,
+    pub precisions: Vec<Option<f32>>,
+    pub recalls: Vec<Option<f32>>,
+    pub accuracies: Vec<Option<f32>>,
+}
+
+impl TrainingStepMetrics {
+    /// Computes the average and standard deviation of loss values grouped by epoch and training phase.
+    ///
+    /// # Returns
+    /// A `HashMap` where each key is a tuple `(epoch, TrainingPhase)` and each value is a tuple `(avg_loss, std_loss)`.
+    /// This can be used for reporting or plotting epoch-level training and validation loss trends.
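+    ///
+    /// A minimal usage sketch (assuming `metrics` is a populated `TrainingStepMetrics`):
+    /// ```ignore
+    /// let summary = metrics.summarize_by_epoch_phase();
+    /// if let Some((avg, std)) = summary.get(&(0, TrainingPhase::Train)) {
+    ///     println!("epoch 0 train loss: {avg:.4} ± {std:.4}");
+    /// }
+    /// ```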
+    pub fn summarize_by_epoch_phase(
+        &self,
+    ) -> std::collections::HashMap<(usize, TrainingPhase), (f32, f32)> {
+        use std::collections::HashMap;
+
+        let mut grouped: HashMap<(usize, TrainingPhase), Vec<f32>> = HashMap::new();
+
+        for i in 0..self.epochs.len() {
+            let key = (self.epochs[i], self.phases[i].clone());
+            grouped.entry(key).or_default().push(self.losses[i]);
+        }
+
+        let mut summary = HashMap::new();
+        for (key, values) in grouped {
+            let avg = values.iter().copied().sum::<f32>() / values.len() as f32;
+            let std = (values.iter().map(|v| (v - avg).powi(2)).sum::<f32>() / values.len() as f32)
+                .sqrt();
+            summary.insert(key, (avg, std)); // insert avg/std loss for this epoch + phase
+        }
+
+        summary
+    }
+
+    /// Summarizes average and std loss per epoch for training and validation phases.
+    ///
+    /// Returns a vector of tuples:
+    /// (epoch, avg_train_loss, avg_val_loss, std_train_loss, std_val_loss)
+    pub fn summarize_loss_for_plotting(&self) -> Vec<(usize, f32, Option<f32>, f32, Option<f32>)> {
+        use std::collections::HashMap;
+
+        let mut train_map: HashMap<usize, Vec<f32>> = HashMap::new();
+        let mut val_map: HashMap<usize, Vec<f32>> = HashMap::new();
+
+        for i in 0..self.epochs.len() {
+            match self.phases[i] {
+                TrainingPhase::Train => train_map.entry(self.epochs[i]).or_default().push(self.losses[i]),
+                TrainingPhase::Validation => val_map.entry(self.epochs[i]).or_default().push(self.losses[i]),
+            }
+        }
+
+        let mut epochs: Vec<_> = train_map.keys().chain(val_map.keys()).copied().collect();
+        epochs.sort_unstable();
+        epochs.dedup();
+
+        epochs
+            .into_iter()
+            .map(|epoch| {
+                let (avg_train, std_train) = train_map.get(&epoch)
+                    .map(|v| compute_loss_stats(v))
+                    .unwrap_or((f32::NAN, f32::NAN));
+                let (avg_val, std_val) = val_map.get(&epoch)
+                    .map(|v| compute_loss_stats(v))
+                    .map_or((None, None), |(avg, std)| (Some(avg), Some(std)));
+
+                (epoch, avg_train, avg_val, std_train, std_val)
+            })
+            .collect()
+    }
+
+    /// Computes the average and standard deviation of precision, recall, and accuracy values grouped by epoch and training phase.
+    ///
+    /// # Returns
+    /// A `HashMap` where each key is a tuple `(epoch, TrainingPhase)` and each value is a tuple of:
+    /// `(avg_precision, std_precision, avg_recall, std_recall, avg_accuracy, std_accuracy)`.
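+    ///
+    /// A minimal usage sketch (assuming `metrics` is a populated `TrainingStepMetrics`):
+    /// ```ignore
+    /// let per_epoch = metrics.summarize_metrics_by_epoch_phase();
+    /// if let Some((_, _, _, _, Some(acc_avg), Some(acc_std))) =
+    ///     per_epoch.get(&(0, TrainingPhase::Validation))
+    /// {
+    ///     println!("epoch 0 val accuracy: {acc_avg:.3} ± {acc_std:.3}");
+    /// }
+    /// ```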
+    pub fn summarize_metrics_by_epoch_phase(
+        &self,
+    ) -> std::collections::HashMap<
+        (usize, TrainingPhase),
+        (
+            Option<f32>,
+            Option<f32>,
+            Option<f32>,
+            Option<f32>,
+            Option<f32>,
+            Option<f32>,
+        ),
+    > {
+        use std::collections::{HashMap, HashSet};
+
+        let mut prec_map: HashMap<(usize, TrainingPhase), Vec<f32>> = HashMap::new();
+        let mut rec_map: HashMap<(usize, TrainingPhase), Vec<f32>> = HashMap::new();
+        let mut acc_map: HashMap<(usize, TrainingPhase), Vec<f32>> = HashMap::new();
+
+        for i in 0..self.epochs.len() {
+            let key = (self.epochs[i], self.phases[i].clone());
+            if let Some(p) = self.precisions[i] {
+                prec_map.entry(key.clone()).or_default().push(p);
+            }
+            if let Some(r) = self.recalls[i] {
+                rec_map.entry(key.clone()).or_default().push(r);
+            }
+            if let Some(a) = self.accuracies[i] {
+                acc_map.entry(key.clone()).or_default().push(a);
+            }
+        }
+
+        let mut result = HashMap::new();
+        let keys: HashSet<_> = self
+            .epochs
+            .iter()
+            .zip(&self.phases)
+            .map(|(e, p)| (*e, p.clone()))
+            .collect();
+
+        let summarize = |vals: &Vec<f32>| {
+            let avg = vals.iter().copied().sum::<f32>() / vals.len() as f32;
+            let std =
+                (vals.iter().map(|v| (v - avg).powi(2)).sum::<f32>() / vals.len() as f32).sqrt();
+            (avg, std)
+        };
+
+        for key in keys {
+            let (prec_avg, prec_std) = prec_map
+                .get(&key)
+                .map(summarize)
+                .map_or((None, None), |(a, s)| (Some(a), Some(s)));
+            let (rec_avg, rec_std) = rec_map
+                .get(&key)
+                .map(summarize)
+                .map_or((None, None), |(a, s)| (Some(a), Some(s)));
+            let (acc_avg, acc_std) = acc_map
+                .get(&key)
+                .map(summarize)
+                .map_or((None, None), |(a, s)| (Some(a), Some(s)));
+
+            result.insert(
+                key,
+                (prec_avg, prec_std, rec_avg, rec_std, acc_avg, acc_std),
+            );
+        }
+
+        result
+    }
+}
+
+
+/// Utility functions for evaluating prediction metrics.
+pub struct Metrics;
+
+impl Metrics {
+    /// Computes accuracy as the proportion of predictions within a tolerance of the target.
+    pub fn accuracy(pred: &[f32], target: &[f32], tolerance: f32) -> f32 {
+        let correct = pred.iter().zip(target).filter(|(p, t)| (*p - *t).abs() <= tolerance).count();
+        correct as f32 / pred.len() as f32
+    }
+
+    /// Computes accuracy as the proportion of predictions within a dynamic (per-element) tolerance of the target.
+    pub fn accuracy_dynamic(pred: &[f32], target: &[f32], tolerance: &[f32]) -> f32 {
+        pred.iter()
+            .zip(target)
+            .zip(tolerance)
+            .filter(|((p, t), tol)| (*p - *t).abs() <= **tol)
+            .count() as f32 / pred.len() as f32
+    }
+
+    /// Computes precision as TP / (TP + FP), based on a binary threshold.
+    pub fn precision(pred: &[f32], target: &[f32], threshold: f32) -> Option<f32> {
+        let mut tp = 0;
+        let mut fp = 0;
+        for (&p, &t) in pred.iter().zip(target) {
+            if p > threshold {
+                if t > threshold {
+                    tp += 1;
+                } else {
+                    fp += 1;
+                }
+            }
+        }
+        if tp + fp > 0 {
+            Some(tp as f32 / (tp + fp) as f32)
+        } else {
+            None
+        }
+    }
+
+    /// Computes recall as TP / (TP + FN), based on a binary threshold.
+    pub fn recall(pred: &[f32], target: &[f32], threshold: f32) -> Option<f32> {
+        let mut tp = 0;
+        let mut fn_ = 0;
+        for (&p, &t) in pred.iter().zip(target) {
+            if t > threshold {
+                if p > threshold {
+                    tp += 1;
+                } else {
+                    fn_ += 1;
+                }
+            }
+        }
+        if tp + fn_ > 0 {
+            Some(tp as f32 / (tp + fn_) as f32)
+        } else {
+            None
+        }
+    }
+}
+
+
+/// Compute average and std deviation from a slice of loss values.
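+///
+/// # Example
+/// A quick check of the population statistics this returns (the divisor is `n`, not `n - 1`):
+/// ```
+/// use redeem_properties::utils::stats::compute_loss_stats;
+/// let (avg, std) = compute_loss_stats(&[1.0, 2.0, 3.0]);
+/// assert!((avg - 2.0).abs() < 1e-6);
+/// assert!((std - (2.0f32 / 3.0).sqrt()).abs() < 1e-6);
+/// ```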
+/// Compute average and std deviation from a slice of loss values.
+pub fn compute_loss_stats(losses: &[f32]) -> (f32, f32) {
+    let avg = losses.iter().copied().sum::<f32>() / losses.len() as f32;
+    let std = (losses.iter().map(|l| (l - avg).powi(2)).sum::<f32>() / losses.len() as f32).sqrt();
+    (avg, std)
+}
\ No newline at end of file

diff --git a/crates/redeem-properties/src/utils/utils.rs b/crates/redeem-properties/src/utils/utils.rs
index 5b21291..98824a6 100644
--- a/crates/redeem-properties/src/utils/utils.rs
+++ b/crates/redeem-properties/src/utils/utils.rs
@@ -3,14 +3,27 @@ use candle_core::utils::{cuda_is_available, metal_is_available};
 use anyhow::{Result, anyhow};
 use std::f64::consts::PI;
 
+// Learning rate scheduler trait
+/// Trait representing a learning rate scheduler that can be updated each step
+/// and queried for the current learning rate.
 pub trait LRScheduler {
-    /// Update the learning rate based on the current step
     fn step(&mut self);
-
-    /// Get the current learning rate
     fn get_last_lr(&self) -> f64;
 }
 
+// Cosine decay with warmup
+/// Cosine learning rate scheduler with linear warmup phase.
+///
+/// This scheduler increases the learning rate linearly from 0 to `initial_lr`
+/// over `num_warmup_steps`, then decays it using cosine annealing over the
+/// remaining training steps, optionally over multiple cycles.
+///
+/// # Fields
+/// * `initial_lr` - The peak learning rate after warmup.
+/// * `current_step` - Internal counter of the current step.
+/// * `num_warmup_steps` - Number of steps to warm up the learning rate.
+/// * `num_training_steps` - Total number of training steps.
+/// * `num_cycles` - Number of cosine cycles in the annealing phase.
 pub struct CosineWithWarmup {
     initial_lr: f64,
     current_step: usize,
@@ -20,32 +33,33 @@ pub struct CosineWithWarmup {
 }
 
 impl CosineWithWarmup {
-    pub fn new(
-        initial_lr: f64,
-        num_warmup_steps: usize,
-        num_training_steps: usize,
-        num_cycles: f64,
-    ) -> Self {
+    /// Create a new `CosineWithWarmup` scheduler.
+    ///
+    /// # Arguments
+    /// * `initial_lr` - Maximum learning rate after warmup.
+    /// * `num_warmup_steps` - Number of steps to linearly increase the learning rate.
+    /// * `num_training_steps` - Total number of training steps.
+    /// * `num_cycles` - Number of cosine cycles during decay.
+    pub fn new(initial_lr: f64, num_warmup_steps: usize, num_training_steps: usize, num_cycles: f64) -> Self {
         Self {
             initial_lr,
             current_step: 0,
-            num_warmup_steps: num_warmup_steps,
+            num_warmup_steps,
             num_training_steps,
             num_cycles,
         }
     }
 
+    /// Computes the learning rate for the current step based on warmup and cosine decay.
     fn get_lr(&self) -> f64 {
         if self.current_step < self.num_warmup_steps {
-            // Linear warmup
             return self.initial_lr * (self.current_step as f64) / (self.num_warmup_steps as f64);
         }
 
-        let progress = (self.current_step - self.num_warmup_steps) as f64
-            / (self.num_training_steps - self.num_warmup_steps) as f64;
-
-        // Cosine decay
-        let cosine_decay = 0.5 * (1.0 + (PI * self.num_cycles * 2.0 * progress).cos());
+        let progress = (self.current_step - self.num_warmup_steps) as f64
+            / (self.num_training_steps - self.num_warmup_steps).max(1) as f64;
+
+        let cosine_decay = 0.5 * (1.0 + (std::f64::consts::PI * self.num_cycles * 2.0 * progress).cos());
         self.initial_lr * cosine_decay.max(1e-10)
     }
 }
@@ -54,12 +68,13 @@ impl LRScheduler for CosineWithWarmup {
 impl LRScheduler for CosineWithWarmup {
     fn step(&mut self) {
         self.current_step += 1;
     }
-
+    
     fn get_last_lr(&self) -> f64 {
         self.get_lr()
     }
 }
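Note (illustrative aside, not part of this patch): to see the shape of the schedule, step the scheduler and sample the learning rate. With `num_cycles = 0.5` the decay is a single half-cosine from the peak down toward zero:

    // Warm up to 1e-3 over 10 steps, then decay over the remaining 90 of 100 steps.
    let mut sched = CosineWithWarmup::new(1e-3, 10, 100, 0.5);
    for step in 0..100 {
        sched.step();
        if step % 25 == 0 {
            // step 0 -> 1e-4 (linear warmup), step 50 -> mid-decay, step 75 -> approaching 0
            println!("step {step:>3}: lr = {:.6}", sched.get_last_lr());
        }
    }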
 
+
 /// Converts a device string to a Candle Device.
 ///
 /// # Supported Device Strings

From 5891b4e3c0f4954c1c76bf4d1b3d97941b295cf9 Mon Sep 17 00:00:00 2001
From: singjc
Date: Sun, 11 May 2025 02:40:52 -0400
Subject: [PATCH 34/75] refactor: Add precursor mass field to PeptideData
 struct in redeem-properties crate

---
 crates/redeem-properties/src/utils/data_handling.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crates/redeem-properties/src/utils/data_handling.rs b/crates/redeem-properties/src/utils/data_handling.rs
index 11520ce..6b09c27 100644
--- a/crates/redeem-properties/src/utils/data_handling.rs
+++ b/crates/redeem-properties/src/utils/data_handling.rs
@@ -3,22 +3,26 @@
 pub struct PeptideData {
     pub sequence: String,
     pub charge: Option<i32>,
+    pub precursor_mass: Option<f32>,
     pub nce: Option<f32>,
     pub instrument: Option<String>,
     pub retention_time: Option<f32>,
     pub ion_mobility: Option<f32>,
+    pub ccs: Option<f32>,
     pub ms2_intensities: Option<Vec<Vec<f32>>>,
 }
 
 impl PeptideData {
-    pub fn new(sequence: &str, charge: Option<i32>, nce: Option<f32>, instrument: Option<&str>, retention_time: Option<f32>, ion_mobility: Option<f32>, ms2_intensities: Option<Vec<Vec<f32>>>) -> Self {
+    pub fn new(sequence: &str, charge: Option<i32>, precursor_mass: Option<f32>, nce: Option<f32>, instrument: Option<&str>, retention_time: Option<f32>, ion_mobility: Option<f32>, ccs: Option<f32>, ms2_intensities: Option<Vec<Vec<f32>>>) -> Self {
         Self {
             sequence: sequence.to_string(),
             charge,
+            precursor_mass,
             nce,
             instrument: instrument.map(|s| s.to_string()),
             retention_time,
             ion_mobility,
+            ccs,
             ms2_intensities
         }
     }

From 874e441aaf1c0ed46e85ff7599d7e94498f320b9 Mon Sep 17 00:00:00 2001
From: singjc
Date: Sun, 11 May 2025 02:41:00 -0400
Subject: [PATCH 35/75] refactor: Add plot_training_metric function to
 redeem-cli crate

---
 .../redeem-cli/src/properties/train/plot.rs   | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/crates/redeem-cli/src/properties/train/plot.rs b/crates/redeem-cli/src/properties/train/plot.rs
index f5b7b80..0f27576 100644
--- a/crates/redeem-cli/src/properties/train/plot.rs
+++ b/crates/redeem-cli/src/properties/train/plot.rs
@@ -1,5 +1,6 @@
 use plotly::{Layout, Plot, Scatter};
 use plotly::common::{Fill, Mode, Title};
+use crate::training::{TrainingStepMetrics, TrainingPhase};
 
 pub fn plot_losses(
     epoch_losses: &[(usize, f32, Option<f32>, f32, Option<f32>)]
@@ -77,3 +78,68 @@
 
     plot
 }
+
+
+
+/// Plot a single training metric (e.g. loss, learning rate, accuracy) over steps.
+pub fn plot_training_metric( + metrics: &TrainingStepMetrics, + metric_name: &str, + title: &str, + x_title: &str, + y_title: &str, +) -> Plot { + let mut plot = Plot::new(); + + let mut train_x = vec![]; + let mut train_y = vec![]; + let mut val_x = vec![]; + let mut val_y = vec![]; + + for i in 0..metrics.steps.len() { + let x = metrics.steps[i] as f64; + let y_opt = match metric_name { + "loss" => Some(metrics.losses[i] as f64), + "lr" => Some(metrics.learning_rates[i]), + "accuracy" => metrics.accuracies[i].map(|a| a as f64), + _ => None, + }; + + if let Some(y) = y_opt { + match metrics.phases[i] { + TrainingPhase::Train => { + train_x.push(x); + train_y.push(y); + } + TrainingPhase::Validation => { + val_x.push(x); + val_y.push(y); + } + } + } + } + + if !train_x.is_empty() { + plot.add_trace( + Scatter::new(train_x.clone(), train_y.clone()) + .mode(Mode::Lines) + .name("Train"), + ); + } + if !val_x.is_empty() { + plot.add_trace( + Scatter::new(val_x.clone(), val_y.clone()) + .mode(Mode::Lines) + .name("Validation"), + ); + } + + plot.set_layout( + Layout::new() + .title(Title::new().text(title)) + .x_axis(plotly::layout::Axis::new().title(x_title)) + .y_axis(plotly::layout::Axis::new().title(y_title)) + ); + + plot +} From 487f6b7c60c7585a198f19e81900b7387146edf4 Mon Sep 17 00:00:00 2001 From: singjc Date: Sun, 11 May 2025 02:43:35 -0400 Subject: [PATCH 36/75] refactor: Update early stopping logic in ModelInterface implementation --- crates/redeem-properties/src/models/model_interface.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 9d3f1ef..9ea9273 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -610,7 +610,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { epochs_without_improvement += 1; if epochs_without_improvement >= early_stopping_patience { info!("Early stopping triggered after {} epochs without validation loss improvement.", early_stopping_patience); - return Ok(epoch_losses); + return Ok(step_metrics); } let checkpoint_path = format!( "redeem_{}_ckpt_model_epoch_{}.safetensors", From 30ad1025640be3e6ac99194d1849004070527477 Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 12 May 2025 00:08:31 -0400 Subject: [PATCH 37/75] refactor: Update plot_losses function in redeem-cli crate --- crates/redeem-cli/src/properties/train/plot.rs | 5 +++-- crates/redeem-cli/src/properties/train/trainer.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/redeem-cli/src/properties/train/plot.rs b/crates/redeem-cli/src/properties/train/plot.rs index 0f27576..dab78ff 100644 --- a/crates/redeem-cli/src/properties/train/plot.rs +++ b/crates/redeem-cli/src/properties/train/plot.rs @@ -1,6 +1,7 @@ use plotly::{Layout, Plot, Scatter}; use plotly::common::{Fill, Mode, Title}; -use crate::training::{TrainingStepMetrics, TrainingPhase}; +use redeem_properties::utils::stats::{TrainingStepMetrics, TrainingPhase}; + pub fn plot_losses( epoch_losses: &[(usize, f32, Option, f32, Option)] @@ -136,7 +137,7 @@ pub fn plot_training_metric( plot.set_layout( Layout::new() - .title(Title::new().text(title)) + .title(title) .x_axis(plotly::layout::Axis::new().title(x_title)) .y_axis(plotly::layout::Axis::new().title(y_title)) ); diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index 
2b2695b..cbd523c 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -15,7 +15,7 @@ use report_builder::{
 };
 
 use crate::properties::load_data;
-use crate::properties::train::plot::plot_losses;
+use crate::properties::train::plot::{plot_losses, plot_training_metric};
 use crate::properties::train::sample_peptides;
 use crate::properties::util::write_bytes_to_file;
 use input::PropertyTrainConfig;
@@ -150,7 +150,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         "ReDeeM",
         &config.version,
         Some("https://github.com/singjc/redeem/blob/master/img/redeem_logo.png?raw=true"),
-        format!("ReDeeM (:?) Trainer Report", config.model_arch).as_str(),
+        &format!("ReDeeM {:?} Trainer Report", config.model_arch),
     );
 
     /* Section 1: Overview */

From f1c74a5eb0d9ad4751cabbd6edfbaedb1d6d5d44 Mon Sep 17 00:00:00 2001
From: singjc
Date: Mon, 12 May 2025 22:14:26 -0400
Subject: [PATCH 38/75] add: RT Norm struct to set type of normalization

---
 .../src/properties/inference/inference.rs     | 54 ++++++++++++++++---
 crates/redeem-cli/src/properties/load_data.rs | 46 +++++++++-------
 .../src/properties/train/trainer.rs           | 16 +++---
 .../src/utils/data_handling.rs                | 19 +++++++
 4 files changed, 101 insertions(+), 34 deletions(-)

diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs
index 876af7e..9f642c8 100644
--- a/crates/redeem-cli/src/properties/inference/inference.rs
+++ b/crates/redeem-cli/src/properties/inference/inference.rs
@@ -1,5 +1,10 @@
 use anyhow::{Context, Result};
+use redeem_properties::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
+use redeem_properties::models::ccs_cnn_tf_model::CCSCNNTFModel;
+use redeem_properties::models::ccs_model::load_collision_cross_section_model;
+use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel;
+use redeem_properties::models::model_interface::ModelInterface;
-use redeem_properties::utils::data_handling::PeptideData;
+use redeem_properties::utils::data_handling::{PeptideData, RTNormalization};
 use redeem_properties::utils::peptdeep_utils::load_modifications;
 use redeem_properties::utils::utils::get_device;
 use redeem_properties::models::rt_model::load_retention_time_model;
@@ -12,19 +17,52 @@
 
 pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> {
     // Load inference data
-    let (inference_data, norm_factor) = load_peptide_data(&config.inference_data, Some(config.nce), Some(config.instrument.clone()), true)?;
+    let (inference_data, norm_factor) = load_peptide_data(&config.inference_data, Some(config.nce), Some(config.instrument.clone()), Some("min_max".to_string()))?;
 
     log::info!("Loaded {} peptides", inference_data.len());
 
     // Dispatch model training based on architecture
     let model_arch = config.model_arch.as_str();
     let device = get_device(&config.device)?;
 
-    let mut model = load_retention_time_model(
-        &config.model_path,
-        None,
-        &config.model_arch,
-        device.clone(),
-    )?;
+    let mut model: Box<dyn ModelInterface> = match model_arch {
+        "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(
+            &config.model_path,
+            None,
+            0,
+            8,
+            4,
+            true,
+            device.clone(),
+        )?),
+        "rt_cnn_tf" => Box::new(RTCNNLSTMModel::new(
+            &config.model_path,
+            None,
+            0,
+            8,
+            4,
+            true,
+            device.clone(),
+        )?),
+        "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new(
+            &config.model_path,
+            None,
+            0,
+            8,
+            4,
+            true,
+            device.clone(),
+        )?),
+        "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new(
+            &config.model_path,
+            None,
+            0,
+            8,
+            4,
+            true,
+            device.clone(),
+        )?),
+        _ => return Err(anyhow::anyhow!("Unsupported RT model architecture: {}", model_arch)),
+    };
 
     let modifications = load_modifications().context("Failed to load modifications")?;
 
diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs
index 6e85e22..0646320 100644
--- a/crates/redeem-cli/src/properties/load_data.rs
+++ b/crates/redeem-cli/src/properties/load_data.rs
@@ -3,7 +3,8 @@ use std::path::Path;
 use std::io::BufReader;
 use anyhow::{Result, Context};
 use csv::ReaderBuilder;
-use redeem_properties::utils::data_handling::PeptideData;
+use redeem_properties::utils::data_handling::{PeptideData, RTNormalization};
+
 
 /// Load peptide training data from a CSV or TSV file and optionally normalize RT.
 ///
@@ -12,8 +13,8 @@ pub fn load_peptide_data<P: AsRef<Path>>(
     path: P,
     nce: Option<f32>,
     instrument: Option<String>,
-    normalize_rt: bool,
-) -> Result<(Vec<PeptideData>, Option<(f32, f32)>)> {
+    normalize_rt: Option<String>,
+) -> Result<(Vec<PeptideData>, RTNormalization)> {
     let file = File::open(&path)
         .with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?;
     let reader = BufReader::new(file);
@@ -83,28 +84,33 @@ pub fn load_peptide_data<P: AsRef<Path>>(
             retention_time,
             ion_mobility,
             ccs,
-            ms2_intensities: None
+            ms2_intensities: None,
         });
     }
 
-    if normalize_rt && !rt_values.is_empty() {
-        let mean = rt_values.iter().copied().sum::<f32>() / rt_values.len() as f32;
-        let std = (rt_values
-            .iter()
-            .map(|v| (v - mean).powi(2))
-            .sum::<f32>()
-            / rt_values.len() as f32)
-            .sqrt();
-
-        for peptide in &mut peptides {
-            if let Some(rt) = peptide.retention_time.as_mut() {
-                *rt = (*rt - mean) / std;
+    match RTNormalization::from_str(normalize_rt) {
+        RTNormalization::ZScore(_, _) if !rt_values.is_empty() => {
+            let mean = rt_values.iter().copied().sum::<f32>() / rt_values.len() as f32;
+            let std = (rt_values.iter().map(|v| (v - mean).powi(2)).sum::<f32>() / rt_values.len() as f32).sqrt();
+            for peptide in &mut peptides {
+                if let Some(rt) = peptide.retention_time.as_mut() {
+                    *rt = (*rt - mean) / std;
+                }
             }
+            Ok((peptides, RTNormalization::ZScore(mean, std)))
         }
-
-        Ok((peptides, Some((mean, std))))
-    } else {
-        Ok((peptides, None))
+        RTNormalization::MinMax(_, _) if !rt_values.is_empty() => {
+            let min = *rt_values.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap();
+            let max = *rt_values.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap();
+            let range = max - min;
+            for peptide in &mut peptides {
+                if let Some(rt) = peptide.retention_time.as_mut() {
+                    *rt = (*rt - min) / range;
+                }
+            }
+            Ok((peptides, RTNormalization::MinMax(min, max)))
+        }
+        _ => Ok((peptides, RTNormalization::None))
     }
 }
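Note (illustrative sketch, not part of this patch; the file name and instrument value are hypothetical): the new signature returns the fitted normalization alongside the peptides, so callers can invert it later:

    // Inside a function returning anyhow::Result<()>.
    let (peptides, norm) = load_peptide_data(
        "train_peptides.tsv",
        Some(30.0),                  // NCE
        Some("QE".to_string()),      // instrument
        Some("min_max".to_string()), // RT normalization mode
    )?;
    log::info!("Loaded {} peptides with {:?} RT normalization", peptides.len(), norm);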
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index cbd523c..bee5988 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -6,7 +6,7 @@ use redeem_properties::models::{
     ccs_cnn_lstm_model::CCSCNNLSTMModel, ccs_cnn_tf_model::CCSCNNTFModel,
     rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel,
 };
-use redeem_properties::utils::data_handling::PeptideData;
+use redeem_properties::utils::data_handling::{PeptideData, RTNormalization};
 use redeem_properties::utils::peptdeep_utils::load_modifications;
 use redeem_properties::utils::utils::get_device;
 use report_builder::{
@@ -29,7 +29,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         &config.train_data,
         Some(config.nce),
         Some(config.instrument.clone()),
-        true,
+        Some(config.rt_normalization.clone().unwrap()),
     )?;
     log::info!("Loaded {} training peptides", train_peptides.len());
@@ -39,7 +39,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
             val_path,
             Some(config.nce),
             Some(config.instrument.clone()),
-            true,
+            Some(config.rt_normalization.clone().unwrap()),
         )
         .context("Failed to load validation data")?;
         (Some(peptides), Some(norm))
@@ -196,7 +196,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
     overview_section.add_plot(acc_plot);
 
     // Inference scatter plot
-    let val_peptides: Vec<PeptideData> = sample_peptides(&val_peptides.as_ref().unwrap(), 1000);
+    let val_peptides: Vec<PeptideData> = sample_peptides(&val_peptides.as_ref().unwrap(), 5000);
     let inference_results: Vec<PeptideData> = model.inference(&val_peptides, config.batch_size, modifications, norm_factor)?;
 
     let (true_rt, pred_rt): (Vec<f64>, Vec<f64>) = val_peptides
         .iter()
         .zip(inference_results.iter())
         .filter_map(|(true_pep, pred_pep)| {
             match (true_pep.retention_time, pred_pep.retention_time) {
                 (Some(t), Some(p)) => {
-                    let t_denorm = t as f64 * norm_factor.unwrap().1 as f64
-                        + norm_factor.unwrap().0 as f64;
+                    let t_denorm = match norm_factor {
+                        RTNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64,
+                        RTNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64,
+                        RTNormalization::None => t as f64,
+                    };
                     Some((t_denorm, p as f64))
                 }
                 _ => None,
             }
         })
         .unzip();
+
 
     let scatter_plot = plot_scatter(
         &vec![true_rt.clone()],
diff --git a/crates/redeem-properties/src/utils/data_handling.rs b/crates/redeem-properties/src/utils/data_handling.rs
index 6b09c27..2722122 100644
--- a/crates/redeem-properties/src/utils/data_handling.rs
+++ b/crates/redeem-properties/src/utils/data_handling.rs
@@ -1,4 +1,23 @@
+
+/// Type of RT normalization used
+#[derive(Debug, Clone, Copy)]
+pub enum RTNormalization {
+    ZScore(f32, f32), // mean, std
+    MinMax(f32, f32), // min, max
+    None,
+}
+
+impl RTNormalization {
+    pub fn from_str(norm: Option<String>) -> Self {
+        match norm.as_deref() {
+            Some("z_score") => RTNormalization::ZScore(0.0, 0.0),
+            Some("min_max") => RTNormalization::MinMax(0.0, 0.0),
+            _ => RTNormalization::None,
+        }
+    }
+}
+
 #[derive(Clone)]
 pub struct PeptideData {
     pub sequence: String,
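Note (illustrative sketch, not part of this patch): the parameters captured in the enum make denormalization explicit. A z-score round trip, with the inversion arms written against the field layouts documented in the enum comments:

    let norm = RTNormalization::ZScore(42.0, 7.5); // mean, std fitted at load time
    let rt_scaled = (50.0f32 - 42.0) / 7.5;        // forward transform
    let rt_denorm = match norm {
        RTNormalization::ZScore(mean, std) => rt_scaled * std + mean,
        RTNormalization::MinMax(min, max) => rt_scaled * (max - min) + min, // fields are (min, max)
        RTNormalization::None => rt_scaled,
    };
    assert!((rt_denorm - 50.0).abs() < 1e-4);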
From 92c7134de21b9a545777c6a8e6d47883913a2741 Mon Sep 17 00:00:00 2001
From: singjc
Date: Mon, 12 May 2025 22:14:34 -0400
Subject: [PATCH 39/75] refactor: Update config loading logic in redeem-cli
 crate

---
 .../src/properties/inference/input.rs         | 40 +++++++++---
 .../redeem-cli/src/properties/train/input.rs  | 64 +++++++++++++------
 2 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/crates/redeem-cli/src/properties/inference/input.rs b/crates/redeem-cli/src/properties/inference/input.rs
index 0e6119e..fcdacd5 100644
--- a/crates/redeem-cli/src/properties/inference/input.rs
+++ b/crates/redeem-cli/src/properties/inference/input.rs
@@ -38,27 +38,51 @@ impl PropertyInferenceConfig {
         let config_json = fs::read_to_string(config_path)
             .with_context(|| format!("Failed to read config file: {:?}", config_path))?;
 
-        let mut config: PropertyInferenceConfig = serde_json::from_str(&config_json)
-            .unwrap_or_else(|_| PropertyInferenceConfig::default());
+        let partial: serde_json::Value = serde_json::from_str(&config_json)?;
+        let mut config = PropertyInferenceConfig::default();
+
+        macro_rules! load_or_default {
+            ($field:ident) => {
+                if let Some(val) = partial.get(stringify!($field)) {
+                    if let Ok(parsed) = serde_json::from_value(val.clone()) {
+                        config.$field = parsed;
+                    } else {
+                        log::warn!(
+                            "Config: invalid value for '{}', using default: {:?}",
+                            stringify!($field), config.$field
+                        );
+                    }
+                } else {
+                    log::warn!(
+                        "Config: missing field '{}', using default: {:?}",
+                        stringify!($field), config.$field
+                    );
+                }
+            };
+        }
+
+        load_or_default!(model_path);
+        load_or_default!(inference_data);
+        load_or_default!(output_file);
+        load_or_default!(model_arch);
+        load_or_default!(device);
+        load_or_default!(batch_size);
+        load_or_default!(instrument);
+        load_or_default!(nce);
 
         // Apply CLI overrides
         if let Some(model_path) = matches.get_one::<String>("model_path") {
             config.model_path = model_path.clone();
-        } else {
-            config.model_path = config.model_path.clone();
         }
-
         if let Some(inference_data) = matches.get_one::<String>("inference_data") {
             validate_tsv_or_csv_file(inference_data)?;
-            config.inference_data = inference_data.clone().to_string();
+            config.inference_data = inference_data.clone();
         } else {
             validate_tsv_or_csv_file(&config.inference_data)?;
         }
-
         if let Some(output_file) = matches.get_one::<String>("output_file") {
             config.output_file = output_file.clone();
         }
-
         if let Some(model_arch) = matches.get_one::<String>("model_arch") {
             config.model_arch = model_arch.clone();
         }
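Note (illustrative sketch, not part of this patch): `load_or_default!` degrades field by field instead of discarding the whole config when one key is bad. The equivalent behavior, unrolled for two hypothetical fields:

    let partial: serde_json::Value =
        serde_json::from_str(r#"{ "device": "cuda" }"#).unwrap();
    // Present and valid: overrides the default.
    let device: String = partial
        .get("device")
        .and_then(|v| serde_json::from_value(v.clone()).ok())
        .unwrap_or_else(|| "cpu".to_string());
    // Absent: keeps its default (the macro would also log a warning here).
    let batch_size: usize = partial
        .get("batch_size")
        .and_then(|v| serde_json::from_value(v.clone()).ok())
        .unwrap_or(64);
    assert_eq!((device.as_str(), batch_size), ("cuda", 64));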
diff --git a/crates/redeem-cli/src/properties/train/input.rs b/crates/redeem-cli/src/properties/train/input.rs
index 58d3fd9..c8523b9 100644
--- a/crates/redeem-cli/src/properties/train/input.rs
+++ b/crates/redeem-cli/src/properties/train/input.rs
@@ -6,12 +6,14 @@ use anyhow::{Context, Result};
 
 use crate::properties::util::validate_tsv_or_csv_file;
 
+
 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct PropertyTrainConfig {
     pub version: String,
     pub train_data: String,
     pub validation_data: Option<String>,
     pub output_file: String,
+    pub rt_normalization: Option<String>,
     pub model_arch: String,
     pub device: String,
     pub batch_size: usize,
@@ -31,6 +33,7 @@ impl Default for PropertyTrainConfig {
             train_data: String::new(),
             validation_data: None,
             output_file: String::from("rt_cnn_tf.safetensors"),
+            rt_normalization: Some(String::from("min_max")),
             model_arch: String::from("rt_cnn_tf"),
             device: String::from("cpu"),
             batch_size: 64,
@@ -46,42 +49,65 @@ impl Default for PropertyTrainConfig {
 }
 
 impl PropertyTrainConfig {
-    pub fn from_arguments(config_path: &PathBuf, matches: &ArgMatches) -> Result<Self> {
+    pub fn from_arguments(config_path: &PathBuf, matches: &ArgMatches) -> anyhow::Result<Self> {
         let config_json = fs::read_to_string(config_path)
-            .with_context(|| format!("Failed to read config file: {:?}", config_path))?;
+            .map_err(|e| anyhow::anyhow!("Failed to read config file: {}", e))?;
+
+        let partial: serde_json::Value = serde_json::from_str(&config_json)?;
+        let mut config = PropertyTrainConfig::default();
+
+        macro_rules! load_or_default {
+            ($field:ident) => {
+                if let Some(val) = partial.get(stringify!($field)) {
+                    if let Ok(parsed) = serde_json::from_value(val.clone()) {
+                        config.$field = parsed;
+                    } else {
+                        log::warn!(
+                            "Config: invalid value for '{}', using default: {:?}",
+                            stringify!($field), config.$field
+                        );
+                    }
+                } else {
+                    log::warn!(
+                        "Config: missing field '{}', using default: {:?}",
+                        stringify!($field), config.$field
+                    );
+                }
+            };
+        }
 
-        let mut config: PropertyTrainConfig = serde_json::from_str(&config_json)
-            .unwrap_or_else(|_| PropertyTrainConfig::default());
+        load_or_default!(train_data);
+        load_or_default!(validation_data);
+        load_or_default!(output_file);
+        load_or_default!(rt_normalization);
+        load_or_default!(model_arch);
+        load_or_default!(device);
+        load_or_default!(batch_size);
+        load_or_default!(validation_batch_size);
+        load_or_default!(learning_rate);
+        load_or_default!(epochs);
+        load_or_default!(early_stopping_patience);
+        load_or_default!(checkpoint_file);
+        load_or_default!(instrument);
+        load_or_default!(nce);
 
         // Apply CLI overrides
         if let Some(train_data) = matches.get_one::<String>("train_data") {
-            validate_tsv_or_csv_file(train_data)?;
-            config.train_data = train_data.clone().to_string();
-        } else {
-            validate_tsv_or_csv_file(&config.train_data)?;
+            config.train_data = train_data.clone();
        }
        if let Some(validation_data) = matches.get_one::<String>("validation_data") {
-            validate_tsv_or_csv_file(validation_data)?;
-            config.validation_data = Some(validation_data.clone().to_string());
-        } else if let Some(val_data) = &config.validation_data {
-            validate_tsv_or_csv_file(val_data)?;
+            config.validation_data = Some(validation_data.clone());
        }
        if let Some(output_file) = matches.get_one::<String>("output_file") {
            config.output_file = output_file.clone();
        }
        if let Some(model_arch) = matches.get_one::<String>("model_arch") {
            config.model_arch = model_arch.clone();
        }
        if let Some(checkpoint_file) = matches.get_one::<String>("checkpoint_file") {
            config.checkpoint_file = Some(checkpoint_file.clone());
        }

        Ok(config)
    }
-}
-
-
+}
\ No newline at end of file

From f95f0874dc23f6ad59c81d6508371695aaace0c6 Mon Sep 17 00:00:00 2001
From: singjc
Date: Mon, 12 May 2025 22:14:50 -0400
Subject: [PATCH 40/75] refactor: Update RT-CNN-LSTM and RT-CNN-Transformer
 models in redeem-properties crate

---
 .../src/models/rt_cnn_lstm_model.rs           | 12 +++----
 .../src/models/rt_cnn_transformer_model.rs    | 36 +++++--------------
 .../redeem-properties/src/models/rt_model.rs  |  4 +--
 3 files changed, 14 insertions(+), 38 deletions(-)

diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
index 1cb99c7..19e3f21 100644
--- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs
@@ -124,17 +124,13 @@ impl ModelInterface for RTCNNLSTMModel {
     let (mean, min, max) = get_tensor_stats(&aa_indices_out)?;
     log::debug!("[RTCNNLSTMModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}");
     let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?;
-        let (mean, min, max) = get_tensor_stats(&mod_x_out)?;
-
log::debug!("[RTCNNLSTMModel] x after dropout stats - min: {min}, max: {max}, mean: {mean}"); + let x = self.rt_decoder.forward(&x)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[RTCNNLSTMModel] x after decoder stats - min: {min}, max: {max}, mean: {mean}"); + let result = x.squeeze(1)?; Ok(result) diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs index 09f1b07..0231fe7 100644 --- a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs @@ -54,7 +54,7 @@ impl ModelInterface for RTCNNTFModel { let rt_encoder = Encoder26aaModCnnTransformerAttnSum::new( &varbuilder.pp("rt_encoder"), 8, // mod_hidden_dim - 140, // hidden_dim + 128, // hidden_dim 256, // ff_dim 4, // num_heads 2, // num_layers @@ -64,7 +64,7 @@ impl ModelInterface for RTCNNTFModel { )?; log::trace!("[RTCNNTFModel] Initializing rt_decoder"); - let rt_decoder = DecoderLinear::new(140, 1, &varbuilder.pp("rt_decoder"))?; + let rt_decoder = DecoderLinear::new(128, 1, &varbuilder.pp("rt_decoder"))?; let constants = ModelConstants::default(); let mod_to_feature = load_mod_to_feature(&constants)?; @@ -107,7 +107,7 @@ impl ModelInterface for RTCNNTFModel { let rt_encoder = Encoder26aaModCnnTransformerAttnSum::from_varstore( &var_store, 8, // mod_hidden_dim - 140, // hidden_dim + 128, // hidden_dim 256, // ff_dim 4, // num_heads 2, // num_layers @@ -132,7 +132,7 @@ impl ModelInterface for RTCNNTFModel { let rt_decoder = DecoderLinear::from_varstore( &var_store, - 140, + 128, 1, vec!["rt_decoder.nn.0.weight", "rt_decoder.nn.1.weight", "rt_decoder.nn.2.weight"], vec!["rt_decoder.nn.0.bias", "rt_decoder.nn.2.bias"] @@ -156,33 +156,13 @@ impl ModelInterface for RTCNNTFModel { let (mean, min, max) = get_tensor_stats(&aa_indices_out)?; log::debug!("[RTCNNTFModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}"); let mod_x_out = xs.i((.., .., 1..1 + MOD_FEATURE_SIZE))?; - - if mod_x_out.shape().elem_count() == 0 { - log::error!("[RTCNNTFModel] mod_x_out is empty! shape: {:?}", mod_x_out.shape()); - } else { - match get_tensor_stats(&mod_x_out) { - Ok((mean, min, max)) => { - log::debug!("[RTCNNTFModel] mod_x_out stats - min: {min}, max: {max}, mean: {mean}"); - } - Err(e) => { - log::error!("[RTCNNTFModel] Failed to compute stats for mod_x_out: {:?}", e); - } - } - } - - log::trace!("[RTCNNTFModel] aa_indices_out: {:?}, mod_x_out: {:?}", aa_indices_out, mod_x_out); + let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; - log::trace!("[RTCNNTFModel] x.shape after rt_encoder: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[RTCNNTFModel] rt_encoder output stats - min: {min}, max: {max}, mean: {mean}"); + let x = self.dropout.forward(&x, self.is_training)?; - log::trace!("[RTCNNTFModel] x.shape after dropout: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[RTCNNTFModel] dropout output stats - min: {min}, max: {max}, mean: {mean}"); + let x = self.rt_decoder.forward(&x)?; - log::trace!("[RTCNNTFModel] x.shape after rt_decoder: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[RTCNNTFModel] rt_decoder output stats - min: {min}, max: {max}, mean: {mean}"); + Ok(x.squeeze(1)?) 
} diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs index 3b9672c..adab38c 100644 --- a/crates/redeem-properties/src/models/rt_model.rs +++ b/crates/redeem-properties/src/models/rt_model.rs @@ -8,7 +8,7 @@ use candle_nn::VarMap; use crate::models::model_interface::{ModelInterface,PredictionResult}; use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel; use crate::models::rt_cnn_transformer_model::RTCNNTFModel; -use crate::utils::data_handling::PeptideData; +use crate::utils::data_handling::{PeptideData, RTNormalization}; use crate::utils::stats::TrainingStepMetrics; use std::collections::HashMap; use crate::utils::peptdeep_utils::ModificationMap; @@ -59,7 +59,7 @@ impl RTModelWrapper { self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs) } - pub fn inference(&mut self, inference_data: &Vec, batch_size: usize, modifications: HashMap<(String, Option), ModificationMap>, rt_norm_params: Option<(f32, f32)>,) -> Result> { + pub fn inference(&mut self, inference_data: &Vec, batch_size: usize, modifications: HashMap<(String, Option), ModificationMap>, rt_norm_params: RTNormalization,) -> Result> { self.model.inference(inference_data, batch_size, modifications, rt_norm_params) } From 4dd8900bfae10a6865387acab4b9b36dee42fa89 Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 12 May 2025 22:15:04 -0400 Subject: [PATCH 41/75] refactor: Update hidden_dim and decoder size in CCSCNNTFModel --- .../src/models/ccs_cnn_tf_model.rs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs index e821546..ab41d5c 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs @@ -57,7 +57,7 @@ impl ModelInterface for CCSCNNTFModel { let ccs_encoder = Encoder26aaModChargeCnnTransformerAttnSum::new( &varbuilder.pp("ccs_encoder"), 8, // mod_hidden_dim - 140, // hidden_dim + 128, // hidden_dim 256, // ff_dim 4, // num_heads 2, // num_layers @@ -67,7 +67,7 @@ impl ModelInterface for CCSCNNTFModel { )?; log::trace!("[CCSCNNTFModel] Initializing ccs_decoder"); - let ccs_decoder = DecoderLinear::new(141, 1, &varbuilder.pp("ccs_decoder"))?; + let ccs_decoder = DecoderLinear::new(129, 1, &varbuilder.pp("ccs_decoder"))?; let constants = ModelConstants::default(); let mod_to_feature = load_mod_to_feature(&constants)?; @@ -110,7 +110,7 @@ impl ModelInterface for CCSCNNTFModel { let ccs_encoder = Encoder26aaModChargeCnnTransformerAttnSum::from_varstore( &var_store, 8, // mod_hidden_dim - 140, // hidden_dim + 128, // hidden_dim 256, // ff_dim 4, // num_heads 2, // num_layers @@ -135,7 +135,7 @@ impl ModelInterface for CCSCNNTFModel { let ccs_decoder = DecoderLinear::from_varstore( &var_store, - 141, + 129, 1, vec!["ccs_decoder.nn.0.weight", "ccs_decoder.nn.1.weight", "ccs_decoder.nn.2.weight"], vec!["ccs_decoder.nn.0.bias", "ccs_decoder.nn.2.bias"] @@ -162,23 +162,23 @@ impl ModelInterface for CCSCNNTFModel { let start_charge = start_mod_x + MOD_FEATURE_SIZE; let aa_indices_out = xs.i((.., .., 0))?; + let (mean, min, max) = get_tensor_stats(&aa_indices_out)?; + log::debug!("[CCSCNNTFModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}"); + let mod_x_out = xs.i((.., .., start_mod_x..start_mod_x + MOD_FEATURE_SIZE))?; let charge_out = xs.i((.., 0..1, start_charge..start_charge + 1))?; let charge_out = 
charge_out.squeeze(2)?; let x = self.ccs_encoder.forward(&aa_indices_out, &mod_x_out, &charge_out)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[CCSCNNTFModel] ccs_encoder output stats - min: {min}, max: {max}, mean: {mean}"); + let x = self.dropout.forward(&x, self.is_training)?; - log::trace!("[CCSCNNTFModel] x.shape after dropout: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[CCSCNNTFModel] dropout output stats - min: {min}, max: {max}, mean: {mean}"); + + + let x = Tensor::cat(&[x, charge_out], 1)?; let x = self.ccs_decoder.forward(&x)?; - log::trace!("[CCSCNNTFModel] x.shape after ccs_decoder: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[CCSCNNTFModel] ccs_decoder output stats - min: {min}, max: {max}, mean: {mean}"); + Ok(x.squeeze(1)?) } From 47ab97611100c75b517bd5161baaa636982cf29c Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 12 May 2025 22:15:26 -0400 Subject: [PATCH 42/75] refactor: clean up trace comments --- .../src/building_blocks/bilstm.rs | 11 +- .../src/building_blocks/building_blocks.rs | 213 ++++++------------ .../src/building_blocks/featurize.rs | 46 +++- .../src/building_blocks/nn.rs | 56 +---- .../src/models/model_interface.rs | 63 ++++-- 5 files changed, 148 insertions(+), 241 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index 68d5204..43d4e1a 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -59,7 +59,6 @@ impl BidirectionalLSTM { let c0_forward = c0.i(0)?; let state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward }; - let start_time = std::time::Instant::now(); let out_fw_states = lstm_forward.seq_init(input, &state_fw)?; let out_fw = Tensor::stack( &out_fw_states.iter().map(|s| s.h()).collect::>(), @@ -67,10 +66,8 @@ impl BidirectionalLSTM { )?; let last_fw_h = out_fw_states.last().unwrap().h().clone(); let last_fw_c = out_fw_states.last().unwrap().c().clone(); - log::trace!("BidirectionLSTM::apply_bidirectional_layer - Forward LSTM time: {:?}", start_time.elapsed()); // Reverse sequence - let start_time = std::time::Instant::now(); let input_reversed = Tensor::cat( &(0..seq_len) .rev() @@ -78,14 +75,12 @@ impl BidirectionalLSTM { .collect::>>()?, 1, )?; - log::trace!("BidirectionLSTM::apply_bidirectional_layer - Reverse sequence time: {:?}", start_time.elapsed()); // Initial states for backward let h0_backward = h0.i(1)?; let c0_backward = c0.i(1)?; let state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward }; - let start_time = std::time::Instant::now(); let out_bw_states = lstm_backward.seq_init(&input_reversed, &state_bw)?; let out_bw = Tensor::stack( &out_bw_states.iter().map(|s| s.h()).collect::>(), @@ -93,7 +88,6 @@ impl BidirectionalLSTM { )?; let last_bw_h = out_bw_states.last().unwrap().h().clone(); let last_bw_c = out_bw_states.last().unwrap().c().clone(); - log::trace!("BidirectionLSTM::apply_bidirectional_layer - Backward LSTM time: {:?}", start_time.elapsed()); // Combine hidden and cell states let hn = Tensor::stack(&[last_fw_h.clone(), last_bw_h.clone()], 0)?; @@ -116,12 +110,9 @@ impl BidirectionalLSTM { let h0_2 = h0.narrow(0, 2, 2)?; let c0_2 = c0.narrow(0, 2, 2)?; - let start_time = std::time::Instant::now(); let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?; - 
log::trace!("BidirectionLSTM::forward_with_state - Layer 1 time: {:?}", start_time.elapsed()); - let start_time = std::time::Instant::now(); + let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?; - log::trace!("BidirectionLSTM::forward_with_state - Layer 2 time: {:?}", start_time.elapsed()); let hn = Tensor::cat(&[hn1, hn2], 0)?; let cn = Tensor::cat(&[cn1, cn2], 0)?; diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index c1e164d..132d067 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result as AnyHowResult}; use candle_core::{DType, Device, Module, Result, Tensor, D}; -use candle_nn as nn; +use candle_nn::{self as nn, linear}; use candle_transformers as transformers; use serde::de; use core::num; @@ -28,13 +28,10 @@ pub struct DecoderLinear { impl DecoderLinear { pub fn new(in_features: usize, out_features: usize, vb: &nn::VarBuilder) -> Result { - log::trace!("[DecoderLinear::new] Initializing linear1"); let linear1 = nn::linear(in_features, 64, vb.pp("nn.0"))?; - log::trace!("[DecoderLinear::new] Initializing prelu"); let prelu = nn::PReLU::new(Tensor::zeros(64, DType::F32, vb.device())?, false); - log::trace!("[DecoderLinear::new] Initializing linear2"); let linear2 = nn::linear(64, out_features, vb.pp("nn.2"))?; - log::trace!("[DecoderLinear::new] Initializing sequential"); + let mut nn = seq(); nn = nn.add(linear1); nn = nn.add(prelu); @@ -73,16 +70,8 @@ impl DecoderLinear { impl Module for DecoderLinear { fn forward(&self, x: &Tensor) -> Result { - log::trace!("[DecoderLinear] input shape: {:?}", x.shape()); match self.nn.forward(x) { Ok(output) => { - log::trace!("[DecoderLinear] output shape: {:?}", output.shape()); - log::trace!( - "[DecoderLinear] output stats - min: {:.4}, max: {:.4}, mean: {:.4}", - output.min_all()?.to_vec0::()?, - output.max_all()?.to_vec0::()?, - output.mean_all()?.to_vec0::()?, - ); Ok(output) } Err(e) => { @@ -120,7 +109,6 @@ impl AAEmbedding { fn from_varstore(varstore: &nn::VarBuilder, hidden_size: usize, name: &str) -> Result { let weight = varstore.get((AA_EMBEDDING_SIZE, hidden_size), name)?; - log::trace!("[AAEmbedding::from_varstore] weight shape (AA_EMBEDDING_SIZE, hidden_size): {:?}, device: {:?}", weight.shape(), weight.device()); let embeddings = nn::Embedding::new(weight, hidden_size); Ok(Self { embeddings }) } @@ -128,13 +116,7 @@ impl AAEmbedding { impl Module for AAEmbedding { fn forward(&self, x: &Tensor) -> Result { - log::trace!("[AAEmbedding::forward] x shape: {:?}, device: {:?}, min: {:?}, max: {:?}", - x.shape(), x.device(), x.min_all(), x.max_all()); - let x = x.to_dtype(DType::I64)?; - log::trace!("[AAEmbedding::forward] x (converted to i64) shape: {:?}, device: {:?}, min: {:?}, max: {:?}", - x.shape(), x.device(), x.min_all(), x.max_all()); - self.embeddings.forward(&x) } } @@ -217,7 +199,7 @@ struct ModEmbeddingFixFirstK { impl ModEmbeddingFixFirstK { fn new(mod_feature_size: usize, out_features: usize, varbuilder: &nn::VarBuilder) -> Result { let k = 6; - let nn = nn::linear(mod_feature_size - k, out_features - k, varbuilder.pp("linear"))?; + let nn = nn::linear_no_bias(mod_feature_size - k, out_features - k, varbuilder.pp("nn"))?; Ok(Self { k, nn }) } @@ -288,19 +270,10 @@ impl Input26aaModPositionalEncoding { } pub fn forward(&self, 
aa_indices: &Tensor, mod_x: &Tensor) -> Result { - log::trace!("[Input26aaModPositionalEncoding::forward] aa_indices shape: {:?}, device: {:?}, min: {:?}, max: {:?}", - aa_indices.shape(), aa_indices.device(), aa_indices.min_all(),aa_indices.max_all()); - - log::trace!("[Input26aaModPositionalEncoding::forward] mod_x shape: {:?}, device: {:?}", mod_x.shape(), mod_x.device()); - let mod_x = self.mod_nn.forward(mod_x)?; - log::trace!("[Input26aaModPositionalEncoding::forward] mod_x (after mod_nn) shape: {:?}, device: {:?}", mod_x.shape(), mod_x.device()); let x = self.aa_emb.forward(aa_indices)?; - log::trace!("[Input26aaModPositionalEncoding::forward] x (after aa_emb) shape: {:?}, device: {:?}", x.shape(), x.device()); - // Concatenate x and mod_x along the last dimension let concatenated = Tensor::cat(&[&x, &mod_x], 2)?; - log::trace!("[Input26aaModPositionalEncoding::forward] concatenated shape: {:?}, device: {:?}", concatenated.shape(), concatenated.device()); self.pos_encoder.forward(&concatenated) } } @@ -332,34 +305,6 @@ impl MetaEmbedding { Ok(Self { nn }) } - // fn one_hot(&self, indices: &Tensor, num_classes: usize) -> AnyHowResult { - // let batch_size = indices.dim(0)?; - - // let mut one_hot_data = vec![0.0f32; batch_size * num_classes]; - - // for i in 0..batch_size { - // let index = indices.get(i)?.to_scalar::()?; - // let class_idx = index as usize; - - // if class_idx < num_classes { - // one_hot_data[i * num_classes + class_idx] = 1.0; - // } else { - // return Err(anyhow::anyhow!( - // "Index {} out of bounds for one-hot encoding", - // class_idx - // )); - // } - // } - - // log::trace!("one hot encoded data of shape: {:?} on device: {:?}", (batch_size, num_classes), indices.device()); - - // log::trace!("one hot encoded data: {:?}", Tensor::from_slice(&one_hot_data, (batch_size, num_classes), indices.device()) - // .context("Failed to create tensor from one-hot data")); - - // Tensor::from_slice(&one_hot_data, (batch_size, num_classes), indices.device()) - // .context("Failed to create tensor from one-hot data") - // } - fn one_hot(&self, indices: &Tensor, num_classes: usize) -> Result { let batch_size = indices.dim(0)?; let device = indices.device(); @@ -385,8 +330,6 @@ impl MetaEmbedding { // Create a tensor from the one-hot data let one_hot = Tensor::from_slice(&one_hot_data, (batch_size, num_classes), device)?; - log::trace!("[MetaEmbedding::one_hot] one hot encoded data shape: {:?}, device: {:?}", one_hot.shape(), one_hot.device()); - Ok(one_hot) } @@ -396,39 +339,23 @@ impl MetaEmbedding { nces: &Tensor, instrument_indices: &Tensor, ) -> Result { - // Log input tensors - log::trace!("[MetaEmbedding::forward] charges shape: {:?}, device: {:?}", charges.shape(), charges.device()); - log::trace!("[MetaEmbedding::forward] nces shape: {:?}, device: {:?}", nces.shape(), nces.device()); - log::trace!("[MetaEmbedding::forward] instrument_indices shape: {:?}, device: {:?}", instrument_indices.shape(), instrument_indices.device()); - log::trace!("[MetaEmbedding::forward] charges: {:?}", charges.to_vec2::()?); - - // // Ensure instrument_indices is a 1D tensor - // let instrument_indices = instrument_indices.squeeze(1)?; // Remove the second dimension - // log::trace!("[MetaEmbedding::forward] instrument_indices (after squeeze) shape: {:?}, device: {:?}", instrument_indices.shape(), instrument_indices.device()); // One-hot encode the instrument indices let inst_x = self.one_hot(&instrument_indices.to_dtype(DType::I64)?, MAX_INSTRUMENT_NUM)?; - 
log::trace!("[MetaEmbedding::forward] inst_x shape: {:?}, device: {:?}", inst_x.shape(), inst_x.device()); - // Ensure all tensors are on the same device let charges = &charges.to_device(inst_x.device())?; let nces = &nces.to_device(inst_x.device())?; - log::trace!("[MetaEmbedding::forward] charges (after to_device) shape: {:?}, device: {:?}", charges.shape(), charges.device()); - log::trace!("[MetaEmbedding::forward] nces (after to_device) shape: {:?}, device: {:?}", nces.shape(), nces.device()); // Concatenate the one-hot encoded instrument indices with NCEs let combined_input = Tensor::cat(&[&inst_x, nces], 1)?; - log::trace!("[MetaEmbedding::forward] combined_input shape: {:?}, device: {:?}", combined_input.shape(), combined_input.device()); // Pass through the linear layer let meta_x = self.nn.forward(&combined_input)?; - log::trace!("[MetaEmbedding::forward] meta_x shape: {:?}, device: {:?}", meta_x.shape(), meta_x.device()); // Concatenate the output with charges let meta_x = Tensor::cat(&[&meta_x, charges], 1)?; - log::trace!("[MetaEmbedding::forward] final meta_x shape: {:?}, device: {:?}", meta_x.shape(), meta_x.device()); Ok(meta_x) } @@ -685,11 +612,20 @@ impl Module for SeqCNN { fn forward(&self, x: &Tensor) -> Result { let x = x.transpose(1, 2)?; - let short = self.cnn_short.forward(&x)?; + let short = match self.cnn_short.forward(&x) { + Ok(output) => output, + Err(e) => { + log::error!("[SeqCNN::forward] cnn_short.forward failed: {:?}", e); + return Err(e); + } + }; + let medium = self.cnn_medium.forward(&x)?; + let long = self.cnn_long.forward(&x)?; let output = Tensor::cat(&[x, short, medium, long], 1)?; + Ok(output.transpose(1, 2)?) } } @@ -801,16 +737,7 @@ impl SeqTransformer { impl Module for SeqTransformer { fn forward(&self, x: &Tensor) -> Result { - // Add check to ensure input feature dim matches expected model dim - let (_b, _t, d) = x.dims3()?; - let model_dim = self.encoder.model_dim; - if d != model_dim { - return Err(candle_core::Error::Msg(format!( - "SeqTransformer received input with dim {} but expected {}", - d, model_dim - ))); - } - self.encoder.forward_with_mask(x, None, self.training) + Ok(self.encoder.forward_with_mask(x, None, self.training)?) 
} } @@ -823,10 +750,10 @@ struct SeqAttentionSum { impl SeqAttentionSum { pub fn new(hidden_dim: usize, varbuilder: &nn::VarBuilder) -> Result { - let attention = nn::Linear::new( - varbuilder.get((1, hidden_dim), "attention.weight")?, - None, - ); + let attention = nn::linear_no_bias( + hidden_dim, + 1, + varbuilder.pp("attn.0"))?; Ok(Self { attention }) } @@ -838,7 +765,14 @@ impl SeqAttentionSum { impl Module for SeqAttentionSum { fn forward(&self, x: &Tensor) -> Result { - let attention_weights = self.attention.forward(x)?; + let attention_weights = match self.attention.forward(x) { + Ok(weights) => weights, + Err(e) => { + log::error!("Attention forward pass failed: {}", e); + return Err(e); + } + }; + // Apply softmax to normalize weights // TODO: This is done in the model itself in the PyTorch implementation @@ -911,34 +845,23 @@ impl Encoder26aaModCnnLstmAttnSum { } pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor) -> Result { - - let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - mod_x forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let additional_tensors: Vec<&Tensor> = vec![&mod_x]; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - additional_tensors forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + log::trace!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnLstmAttnSum] CNN output stats - min: {min}, max: {max}, mean: {mean}"); - let start_time = Instant::now(); + let x = self.input_lstm.forward(&x)?; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnLstmAttnSum] LSTM output stats - min: {min}, max: {max}, mean: {mean}"); - let start_time = Instant::now(); + let x = self.attn_sum.forward(&x)?; - log::trace!("Encoder26aaModCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnLstmAttnSum] AttentionSum output stats - min: {min}, max: {max}, mean: {mean}"); + Ok(x) } } @@ -1006,15 +929,15 @@ impl Encoder26aaModChargeCnnLstmAttnSum { let x = aa_one_hot(&aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - let start_time = Instant::now(); + let (mean, min, max) = get_tensor_stats(&x)?; + log::trace!("[Encoder26aaModChargeCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + let x = self.input_cnn.forward(&x)?; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_cnn forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let x = self.input_lstm.forward(&x)?; - 
log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - input_lstm forward time: {:.3?}", start_time.elapsed()); - let start_time = Instant::now(); + let x = self.attn_sum.forward(&x)?; - log::trace!("Encoder26aaModChargeCnnLstmAttnSum::forward - attn_sum forward time: {:.3?}", start_time.elapsed()); + Ok(x) } } @@ -1025,6 +948,7 @@ impl Encoder26aaModChargeCnnLstmAttnSum { pub struct Encoder26aaModCnnTransformerAttnSum { mod_nn: ModEmbeddingFixFirstK, input_cnn: SeqCNN, + proj_cnn_to_transformer: candle_nn::Linear, input_transformer: SeqTransformer, attn_sum: SeqAttentionSum, } @@ -1060,6 +984,10 @@ impl Encoder26aaModCnnTransformerAttnSum { names_input_cnn_weight, names_input_cnn_bias, )?, + proj_cnn_to_transformer: candle_nn::Linear::new( + varstore.get((input_dim * 4, hidden_dim), "proj_cnn_to_transformer.weight")?, + Some(varstore.get(hidden_dim, "proj_cnn_to_transformer.bias")?), + ), input_transformer: SeqTransformer::from_varstore( varstore.pp(transformer_pp).clone(), input_dim * 4, @@ -1095,6 +1023,7 @@ impl Encoder26aaModCnnTransformerAttnSum { Ok(Self { mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?, input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?, + proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim*4, hidden_dim, varbuilder.pp("proj_cnn_to_transformer"))?, input_transformer: SeqTransformer::new( &varbuilder.pp("input_transformer"), input_dim * 4, @@ -1111,35 +1040,28 @@ impl Encoder26aaModCnnTransformerAttnSum { } pub fn forward(&self, aa_indices: &Tensor, mod_x: &Tensor) -> Result { - let start_time = Instant::now(); let mod_x = self.mod_nn.forward(mod_x)?; - log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - mod_x forward time: {:.3?}", start_time.elapsed()); let additional_tensors: Vec<&Tensor> = vec![&mod_x]; - let start_time = Instant::now(); + let x = aa_one_hot(aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; - log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - aa_one_hot forward time: {:.3?}", start_time.elapsed()); + let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + log::trace!("[Encoder26aaModCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + + if !mean.is_finite() || !min.is_finite() || !max.is_finite() { + log::error!("ERROR [Encoder26aaModCnnTransformerAttnSum] aa_one_hot produced non-finite tensor stats: mean={mean}, min={min}, max={max}"); + candle_core::bail!("ERRORNon-finite values found in peptide encoding output."); + } - let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; - log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_cnn forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnTransformerAttnSum] input_cnn output stats - min: {min}, max: {max}, mean: {mean}"); - let start_time = Instant::now(); + let x = self.proj_cnn_to_transformer.forward(&x)?; + let x = self.input_transformer.forward(&x)?; - log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - input_transformer forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnTransformerAttnSum] input_transformer output stats - min: {min}, max: {max}, mean: {mean}"); - let start_time = Instant::now(); let x = self.attn_sum.forward(&x)?; - 
log::trace!("[Encoder26aaModCnnTransformerAttnSum::forward] - attn_sum forward time: {:.3?}", start_time.elapsed()); - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModCnnTransformerAttnSum] attn_sum output stats - min: {min}, max: {max}, mean: {mean}"); Ok(x) } @@ -1151,6 +1073,7 @@ impl Encoder26aaModCnnTransformerAttnSum { pub struct Encoder26aaModChargeCnnTransformerAttnSum { mod_nn: ModEmbeddingFixFirstK, input_cnn: SeqCNN, + proj_cnn_to_transformer: candle_nn::Linear, input_transformer: SeqTransformer, attn_sum: SeqAttentionSum, } @@ -1172,7 +1095,7 @@ impl Encoder26aaModChargeCnnTransformerAttnSum { names_attn_sum: Vec<&str>, device: &Device, ) -> Result { - let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim + 1; Ok(Self { mod_nn: ModEmbeddingFixFirstK::from_varstore( &varstore, @@ -1186,6 +1109,10 @@ impl Encoder26aaModChargeCnnTransformerAttnSum { names_input_cnn_weight, names_input_cnn_bias, )?, + proj_cnn_to_transformer: candle_nn::Linear::new( + varstore.get((input_dim * 4, hidden_dim), "proj_cnn_to_transformer.weight")?, + Some(varstore.get(hidden_dim, "proj_cnn_to_transformer.bias")?), + ), input_transformer: SeqTransformer::from_varstore( varstore.pp(transformer_pp).clone(), input_dim * 4, @@ -1217,10 +1144,11 @@ impl Encoder26aaModChargeCnnTransformerAttnSum { dropout_prob: f32, device: &Device, ) -> Result { - let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim; + let input_dim = AA_EMBEDDING_SIZE + mod_hidden_dim + 1; Ok(Self { mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?, input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?, + proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim*4, hidden_dim, varbuilder.pp("proj_cnn_to_transformer"))?, input_transformer: SeqTransformer::new( &varbuilder.pp("input_transformer"), input_dim * 4, @@ -1241,24 +1169,21 @@ impl Encoder26aaModChargeCnnTransformerAttnSum { let charges_repeated = charges.unsqueeze(1)?.repeat(&[1, mod_x.dim(1)?, 1])?; let additional_tensors: Vec<&Tensor> = vec![&mod_x, &charges_repeated]; + let x = aa_one_hot(aa_indices, &additional_tensors) .map_err(|e| candle_core::Error::Msg(e.to_string()))?; + let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + log::trace!("[Encoder26aaModChargeCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); let x = self.input_cnn.forward(&x)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] input_cnn output stats - min: {min}, max: {max}, mean: {mean}"); + + let x = self.proj_cnn_to_transformer.forward(&x)?; let x = self.input_transformer.forward(&x)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] input_transformer output stats - min: {min}, max: {max}, mean: {mean}"); let x = self.attn_sum.forward(&x)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[Encoder26aaModChargeCnnTransformerAttnSum] attn_sum output stats - min: {min}, max: {max}, mean: {mean}"); - Ok(x) } } diff --git a/crates/redeem-properties/src/building_blocks/featurize.rs b/crates/redeem-properties/src/building_blocks/featurize.rs index 2272612..751464e 100644 --- a/crates/redeem-properties/src/building_blocks/featurize.rs +++ b/crates/redeem-properties/src/building_blocks/featurize.rs @@ 
-39,31 +39,53 @@ pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result { /// One-hot encode amino acid indices and concatenate additional tensors. pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result { let (batch_size, seq_len) = aa_indices.shape().dims2()?; + log::trace!("[aa_one_hot] batch_size: {}, seq_len: {}", batch_size, seq_len); let num_classes = AA_EMBEDDING_SIZE; - // Extract all indices as f32s once let indices = aa_indices.to_vec2::()?; - - // Preallocate output buffer let mut one_hot_data = vec![0.0f32; batch_size * seq_len * num_classes]; - // Use parallel iterator for speed one_hot_data .par_chunks_mut(seq_len * num_classes) .zip(indices.par_iter()) - .for_each(|(chunk, row)| { + .enumerate() + .try_for_each(|(batch_idx, (chunk, row))| -> Result<()> { for (seq_idx, &fidx) in row.iter().enumerate() { + if !fidx.is_finite() { + return Err(anyhow!( + "Invalid AA index: found NaN or Inf at batch {}, position {}: {}", + batch_idx, seq_idx, fidx + )); + } + + if fidx < 0.0 { + return Err(anyhow!( + "Invalid AA index: negative value at batch {}, position {}: {}", + batch_idx, seq_idx, fidx + )); + } + let class_idx = fidx.round() as usize; - if class_idx < num_classes { - chunk[seq_idx * num_classes + class_idx] = 1.0; + if class_idx >= num_classes { + return Err(anyhow!( + "AA index out of bounds: got {}, but num_classes = {} (batch {}, position {})", + class_idx, num_classes, batch_idx, seq_idx + )); } + + let index = seq_idx * num_classes + class_idx; + chunk[index] = 1.0; } - }); + Ok(()) + })?; - let one_hot_tensor = Tensor::from_slice(&one_hot_data, (batch_size, seq_len, num_classes), aa_indices.device()) - .map_err(|e| anyhow!("Failed to create one-hot tensor: {}", e))?; + let one_hot_tensor = Tensor::from_slice( + &one_hot_data, + (batch_size, seq_len, num_classes), + aa_indices.device(), + ) + .map_err(|e| anyhow!("Failed to create one-hot tensor: {}", e))?; - // Concatenate with additional tensors if cat_others.is_empty() { Ok(one_hot_tensor) } else { @@ -75,6 +97,8 @@ pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result + + /// Get the modification features for a given set of modifications and modification sites. /// /// Based on https://github.com/MannLabs/alphapeptdeep/blob/450518a39a4cd7d03db391108ec8700b365dd436/peptdeep/model/featurize.py#L47 diff --git a/crates/redeem-properties/src/building_blocks/nn.rs b/crates/redeem-properties/src/building_blocks/nn.rs index a8343c7..c460717 100644 --- a/crates/redeem-properties/src/building_blocks/nn.rs +++ b/crates/redeem-properties/src/building_blocks/nn.rs @@ -114,34 +114,17 @@ impl TransformerEncoder { } pub fn forward_with_mask(&self, x: &Tensor, padding_mask: Option<&Tensor>, training: bool) -> Result { - log::trace!("[TransformerEncoder] input x shape: {:?}", x.shape()); - let (mean, min, max) = get_tensor_stats(x)?; - log::debug!("[TransformerEncoder] input stats: mean={}, min={}, max={}", mean, min, max); let (b, t, _) = x.dims3()?; let pe = self.pos_encoding.i((..t, ..))? .unsqueeze(0)? 
.broadcast_as((b, t, self.pos_encoding.dim(1)?))?; - log::trace!("[TransformerEncoder] positional encoding shape: {:?}", pe.shape()); - let (mean, min, max) = get_tensor_stats(&pe)?; - log::debug!("[TransformerEncoder] positional encoding stats: mean={}, min={}, max={}", mean, min, max); - let mut out = x.broadcast_add(&pe)?; - let (mean, min, max) = get_tensor_stats(&out)?; - log::debug!("[TransformerEncoder] after positional encoding stats: mean={}, min={}, max={}", mean, min, max); out = self.dropout.forward(&out, training)?; - log::trace!("[TransformerEncoder] after dropout shape: {:?}", out.shape()); - let (mean, min, max) = get_tensor_stats(&out)?; - log::debug!("[TransformerEncoder] after dropout stats: mean={}, min={}, max={}", mean, min, max); - - for (i, layer) in self.layers.iter().enumerate() { - log::trace!("[TransformerEncoder] applying layer {}", i); + for (_i, layer) in self.layers.iter().enumerate() { out = layer.forward(&out, padding_mask, training)?; - log::trace!("[TransformerEncoder] output shape after layer {}: {:?}", i, out.shape()); - let (mean, min, max) = get_tensor_stats(&out)?; - log::debug!("[TransformerEncoder] output stats after layer {}: mean={}, min={}, max={}", i, mean, min, max); } Ok(out) } @@ -182,26 +165,12 @@ impl TransformerEncoderLayer { } pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>, training: bool) -> Result { - log::trace!("[TransformerEncoderLayer] input x shape: {:?}", x.shape()); let attn = self.self_attn.forward(x, mask)?; - let (mean, min, max) = get_tensor_stats(&attn)?; - log::debug!("[TransformerEncoderLayer] attention stats: mean={}, min={}, max={}", mean, min, max); let tmp = self.dropout1.forward(&attn, training)?; - let (mean, min, max) = get_tensor_stats(&tmp)?; - log::debug!("[TransformerEncoderLayer] attention after dropout stats: mean={}, min={}, max={}", mean, min, max); let tmp2 = x.broadcast_add(&tmp)?; - let (mean, min, max) = get_tensor_stats(&tmp2)?; - log::debug!("[TransformerEncoderLayer] after residual connection stats: mean={}, min={}, max={}", mean, min, max); let x = self.norm1.forward(&tmp2)?; - let (mean, min, max) = get_tensor_stats(&x)?; - log::debug!("[TransformerEncoderLayer] after norm1 stats: mean={}, min={}, max={}", mean, min, max); let ff = self.ff.forward(&x)?; - let (mean, min, max) = get_tensor_stats(&ff)?; - log::debug!("[TransformerEncoderLayer] feedforward stats: mean={}, min={}, max={}", mean, min, max); let result = self.norm2.forward(&x.broadcast_add(&self.dropout2.forward(&ff, training)?)?)?; - log::trace!("[TransformerEncoderLayer] output shape: {:?}", result.shape()); - let (mean, min, max) = get_tensor_stats(&result)?; - log::debug!("[TransformerEncoderLayer] output stats: mean={}, min={}, max={}", mean, min, max); Ok(result) } } @@ -237,34 +206,21 @@ impl MultiHeadAttention { pub fn forward(&self, x: &Tensor, mask: Option<&Tensor>) -> Result { let (b, t, _) = x.dims3()?; - log::trace!("[MultiHeadAttention] Input shape: b={}, t={}, head_dim={} (num_heads={})", b, t, self.head_dim, self.num_heads); let q = self.proj_q.forward(x)? .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? .contiguous()?; - log::trace!("[MultiHeadAttention] Q shape after projection and transpose: {:?}", q.shape()); - let (mean, min, max) = get_tensor_stats(&q)?; - log::debug!("[MultiHeadAttention] Q stats: mean={}, min={}, max={}", mean, min, max); let k = self.proj_k.forward(x)? .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? 
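            // Q, K and V are each reshaped from (b, t, d_model) to (b, num_heads, t, head_dim);
            // contiguous() packs the transposed layout before the batched matmuls.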
.contiguous()?; - log::trace!("[MultiHeadAttention] K shape after projection and transpose: {:?}", k.shape()); - let (mean, min, max) = get_tensor_stats(&k)?; - log::debug!("[MultiHeadAttention] K stats: mean={}, min={}, max={}", mean, min, max); let v = self.proj_v.forward(x)? .reshape((b, t, self.num_heads, self.head_dim))? .transpose(1, 2)? .contiguous()?; - log::trace!("[MultiHeadAttention] V shape after projection and transpose: {:?}", v.shape()); - let (mean, min, max) = get_tensor_stats(&v)?; - log::debug!("[MultiHeadAttention] V stats: mean={}, min={}, max={}", mean, min, max); - - - log::trace!("[MultiHeadAttention] Q/K/V shape after projection and transpose: {:?}", q.shape()); let k_t = k.transpose(2, 3)?.contiguous()?; let mut scores = q.matmul(&k_t)? / (self.head_dim as f64).sqrt(); @@ -277,12 +233,7 @@ impl MultiHeadAttention { } }; - log::trace!("[MultiHeadAttention] Attention score shape: {:?}", scores.shape()); - let (mean, min, max) = get_tensor_stats(&scores)?; - log::debug!("[MultiHeadAttention] Attention score stats: mean={}, min={}, max={}", mean, min, max); - if let Some(mask) = mask { - log::trace!("[MultiHeadAttention] Applying mask"); let mask = mask.unsqueeze(1)?; let scale = Tensor::new(1e9f32, x.device())?; scores = match scores.broadcast_add(&mask.neg()?.mul(&scale)?) { @@ -301,8 +252,6 @@ impl MultiHeadAttention { return Err(e.into()); } }; - let (attn_mean, attn_min, attn_max) = get_tensor_stats(&attn)?; - log::debug!("[MultiHeadAttention] Attention stats: mean={}, min={}, max={}", attn_mean, attn_min, attn_max); let context = match attn.matmul(&v) { Ok(ctx) => ctx.transpose(1, 2)?.reshape((b, t, self.num_heads * self.head_dim))?, @@ -312,9 +261,6 @@ impl MultiHeadAttention { } }; - log::trace!("[MultiHeadAttention] Final context shape: {:?}", context.shape()); - let (mean, min, max) = get_tensor_stats(&context)?; - log::debug!("[MultiHeadAttention] Context stats: mean={}, min={}, max={}", mean, min, max); self.proj_out.forward(&context) } } diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 9ea9273..ef68e24 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -2,10 +2,10 @@ use crate::{ building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed}, models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper}, utils::{ - data_handling::PeptideData, logging::Progress, peptdeep_utils::{ + data_handling::{PeptideData, RTNormalization}, logging::Progress, peptdeep_utils::{ get_modification_indices, get_modification_string, parse_instrument_index, remove_mass_shift, - }, stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, utils::{CosineWithWarmup, LRScheduler} + }, stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, utils::{get_tensor_stats, CosineWithWarmup, LRScheduler} }, }; use anyhow::{Context, Result}; @@ -13,7 +13,7 @@ use candle_core::{DType, Device, Tensor, Var}; use candle_nn::{Optimizer, VarMap}; use log::info; use rayon::prelude::*; -use std::ops::{Deref, Index}; +use std::{ops::{Deref, Index}, process::Output}; use std::path::Path; use std::{collections::HashMap, path::PathBuf}; @@ -293,7 +293,9 @@ pub trait ModelInterface: Send + Sync + ModelClone { let mod_feature_size = self.get_mod_element_count(); let mod_to_feature = self.get_mod_to_feature(); + log::trace!("[ModelInterface::encode_peptide] 
peptide_sequence: {}", peptide_sequence); let aa_tensor = aa_indices_tensor(peptide_sequence, device)?; + let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?; let mod_names: Vec<&str> = mods.split(';').filter(|s| !s.is_empty()).collect(); @@ -343,9 +345,21 @@ pub trait ModelInterface: Send + Sync + ModelClone { } if features.len() == 1 { - Ok(features.remove(0)) + let output = features.remove(0); + let (mean, min, max) = get_tensor_stats(&output)?; + if !mean.is_finite() || !min.is_finite() || !max.is_finite() { + log::error!("For Peptide = {peptide_sequence} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}"); + anyhow::bail!("Non-finite values found in peptide encoding output."); + } + Ok(output) } else { - Ok(Tensor::cat(&features, 2)?) + let output = Tensor::cat(&features, 2)?; + let (mean, min, max) = get_tensor_stats(&output)?; + if !mean.is_finite() || !min.is_finite() || !max.is_finite() { + log::error!("For Peptide = {peptide_sequence} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}"); + anyhow::bail!("Non-finite values found in peptide encoding output."); + } + Ok(output) } } @@ -532,6 +546,12 @@ pub trait ModelInterface: Send + Sync + ModelClone { )); progress.inc(); + // If the loss is NaN, stop training and throw an error + if loss_val.is_nan() { + log::error!("Loss is NaN, stopping training."); + return Err(anyhow::anyhow!("Loss is NaN, stopping training.")); + } + Ok(()) }, )?; @@ -841,7 +861,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { (String, Option), crate::utils::peptdeep_utils::ModificationMap, >, - rt_norm_params: Option<(f32, f32)>, + rt_norm: RTNormalization, ) -> Result> { let num_batches = (inference_data.len() + batch_size - 1) / batch_size; info!( @@ -849,16 +869,16 @@ pub trait ModelInterface: Send + Sync + ModelClone { inference_data.len(), num_batches ); - + let progress = Progress::new(inference_data.len(), "[inference] Batch:"); let mut result: Vec> = vec![None; inference_data.len()]; - + inference_data .par_chunks(batch_size) .enumerate() .map(|(batch_idx, batch_data)| { let start_idx = batch_idx * batch_size; - + let peptides: Vec = batch_data .iter() .map(|p| remove_mass_shift(&p.sequence)) @@ -871,7 +891,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { .iter() .map(|p| get_modification_indices(&p.sequence)) .collect(); - + let charges = batch_data .iter() .filter_map(|p| p.charge) @@ -881,14 +901,14 @@ pub trait ModelInterface: Send + Sync + ModelClone { } else { None }; - + let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; - + let instruments = batch_data .iter() .filter_map(|p| p.instrument.clone()) @@ -898,12 +918,12 @@ pub trait ModelInterface: Send + Sync + ModelClone { } else { None }; - + let input_tensor = self .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)? 
.to_device(self.get_device())?; let output = self.forward(&input_tensor)?; - + match self.property_type() { PropertyType::RT | PropertyType::CCS => { let predictions = output.to_vec1()?; @@ -914,12 +934,11 @@ pub trait ModelInterface: Send + Sync + ModelClone { let mut peptide = batch_data[i].clone(); match self.property_type() { PropertyType::RT => { - peptide.retention_time = - if let Some((mean, std)) = rt_norm_params { - Some(pred * std + mean) - } else { - Some(pred) - }; + peptide.retention_time = Some(match rt_norm { + RTNormalization::ZScore(mean, std) => pred * std + mean, + RTNormalization::MinMax(min, max) => pred * (max - min) + min, + RTNormalization::None => pred, + }); } PropertyType::CCS => peptide.ion_mobility = Some(pred), _ => {} @@ -941,7 +960,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { result[idx] = Some(peptide); progress.inc(); }); - + progress.finish(); Ok(result.into_iter().flatten().collect()) } @@ -953,7 +972,9 @@ pub trait ModelInterface: Send + Sync + ModelClone { modifications: &HashMap<(String, Option), crate::utils::peptdeep_utils::ModificationMap>, ) -> Result<(Tensor, Tensor)> { let peptides: Vec = batch_data.par_iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + let mods: Vec = batch_data.par_iter().map(|p| get_modification_string(&p.sequence, modifications)).collect(); + let mod_sites: Vec = batch_data.par_iter().map(|p| get_modification_indices(&p.sequence)).collect(); let charges = batch_data.par_iter().filter_map(|p| p.charge).collect::>(); From b6ec2a404fbfc882438259a1ceefc15f65c5a09f Mon Sep 17 00:00:00 2001 From: singjc Date: Mon, 12 May 2025 23:10:17 -0400 Subject: [PATCH 43/75] refactor: Update peptide data loading logic in redeem-cli crate --- .../src/properties/inference/inference.rs | 41 +++++++++++-------- crates/redeem-cli/src/properties/load_data.rs | 39 +++++++++++------- .../src/properties/train/trainer.rs | 2 + 3 files changed, 52 insertions(+), 30 deletions(-) diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs index 9f642c8..e8caef3 100644 --- a/crates/redeem-cli/src/properties/inference/inference.rs +++ b/crates/redeem-cli/src/properties/inference/inference.rs @@ -2,22 +2,27 @@ use anyhow::{Context, Result}; use redeem_properties::models::ccs_cnn_lstm_model::CCSCNNLSTMModel; use redeem_properties::models::ccs_cnn_tf_model::CCSCNNTFModel; use redeem_properties::models::ccs_model::load_collision_cross_section_model; -use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel; use redeem_properties::models::model_interface::ModelInterface; +use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel; +use redeem_properties::models::rt_model::load_retention_time_model; use redeem_properties::utils::data_handling::{PeptideData, RTNormalization}; use redeem_properties::utils::peptdeep_utils::load_modifications; use redeem_properties::utils::utils::get_device; -use redeem_properties::models::rt_model::load_retention_time_model; -use crate::properties::load_data::load_peptide_data; -use crate::properties::util::write_bytes_to_file; use crate::properties::inference::input::PropertyInferenceConfig; use crate::properties::inference::output::write_peptide_data; +use crate::properties::load_data::load_peptide_data; +use crate::properties::util::write_bytes_to_file; pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { - // Load inference data - let (inference_data, norm_factor) = load_peptide_data(&config.inference_data, 
Some(config.nce), Some(config.instrument.clone()), Some("min_max".to_string()))?; + let (inference_data, norm_factor) = load_peptide_data( + &config.inference_data, + &config.model_arch, + Some(config.nce), + Some(config.instrument.clone()), + Some("min_max".to_string()), + )?; log::info!("Loaded {} peptides", inference_data.len()); // Dispatch model training based on architecture @@ -44,14 +49,14 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { device.clone(), )?), "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new( - &config.model_path, - None, - 0, - 8, - 4, - true, - device.clone(), - )?), + &config.model_path, + None, + 0, + 8, + 4, + true, + device.clone(), + )?), "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new( &config.model_path, None, @@ -61,7 +66,12 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { true, device.clone(), )?), - _ => return Err(anyhow::anyhow!("Unsupported RT model architecture: {}", model_arch)), + _ => { + return Err(anyhow::anyhow!( + "Unsupported RT model architecture: {}", + model_arch + )); + } }; let modifications = load_modifications().context("Failed to load modifications")?; @@ -76,7 +86,6 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { )?; log::info!("Inference completed in {:?}", start_time.elapsed()); - log::info!("Predictions saved to: {}", config.output_file); write_peptide_data(&inference_results, &config.output_file)?; diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs index 0646320..e29d239 100644 --- a/crates/redeem-cli/src/properties/load_data.rs +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -11,6 +11,7 @@ use redeem_properties::utils::data_handling::{PeptideData, RTNormalization}; /// Returns both the peptide vector and optionally (mean, std) of retention times. 
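/// A minimal usage sketch (illustrative only: the file name and the exact column
/// layout are assumptions, not a documented API):
/// ```ignore
/// let (peptides, norm) = load_peptide_data(
///     "peptides.tsv",              // CSV/TSV with sequence / retention time columns
///     "rt_cnn_lstm",               // the arch string gates which optional columns are read
///     None,                        // NCE: only consulted for "ms2_bert"
///     None,                        // instrument: only consulted for "ms2_bert"
///     Some("min_max".to_string()), // RT normalization strategy
/// )?;
/// ```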
pub fn load_peptide_data<P: AsRef<Path>>(
     path: P,
+    model_arch: &str,
     nce: Option<i32>,
     instrument: Option<String>,
     normalize_rt: Option<String>,
 ) -> Result<(Vec<PeptideData>, RTNormalization)> {
     let file = File::open(&path)
         .with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?;
@@ -43,9 +44,12 @@ pub fn load_peptide_data<P: AsRef<Path>>(
             .get(headers.iter().position(|h| h.to_lowercase() == "retention time").unwrap_or(3))
             .and_then(|s| s.parse::<f32>().ok());
 
-        let charge = record
-            .get(headers.iter().position(|h| h.to_lowercase() == "charge").unwrap_or(usize::MAX))
-            .and_then(|s| s.parse::<i32>().ok());
+        let charge = match model_arch {
+            "rt_cnn_lstm" | "rt_cnn_tf" => None,
+            _ => record
+                .get(headers.iter().position(|h| h.to_lowercase() == "charge").unwrap_or(usize::MAX))
+                .and_then(|s| s.parse::<i32>().ok()),
+        };
 
         let precursor_mass = record
             .get(headers.iter().position(|h| h.to_lowercase() == "precursor_mass").unwrap_or(usize::MAX))
             .and_then(|s| s.parse::<f64>().ok());
@@ -59,17 +63,24 @@ pub fn load_peptide_data<P: AsRef<Path>>(
             .get(headers.iter().position(|h| h.to_lowercase() == "ccs").unwrap_or(usize::MAX))
             .and_then(|s| s.parse::<f32>().ok());
 
-        let in_nce = nce.or_else(|| {
-            record
-                .get(headers.iter().position(|h| h.to_lowercase() == "nce").unwrap_or(usize::MAX))
-                .and_then(|s| s.parse::<i32>().ok())
-        });
-
-        let in_instrument = instrument.clone().or_else(|| {
-            record
-                .get(headers.iter().position(|h| h.to_lowercase() == "instrument").unwrap_or(usize::MAX))
-                .map(|s| s.to_string())
-        });
+        let in_nce = match model_arch {
+            "ms2_bert" => nce.or_else(|| {
+                record
+                    .get(headers.iter().position(|h| h.to_lowercase() == "nce").unwrap_or(usize::MAX))
+                    .and_then(|s| s.parse::<i32>().ok())
+            }),
+            _ => None,
+        };
+
+        let in_instrument = match model_arch {
+            "ms2_bert" => instrument.clone().or_else(|| {
+                record
+                    .get(headers.iter().position(|h| h.to_lowercase() == "instrument").unwrap_or(usize::MAX))
+                    .map(|s| s.to_string())
+            }),
+            _ => None,
+        };
 
         if let Some(rt) = retention_time {
             rt_values.push(rt);
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index bee5988..6324e0c 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -27,6 +27,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
     // Load training data
     let (train_peptides, norm_factor) = load_peptide_data(
         &config.train_data,
+        &config.model_arch,
         Some(config.nce),
         Some(config.instrument.clone()),
         Some(config.rt_normalization.clone().unwrap()),
@@ -37,6 +38,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
     let (val_peptides, _val_norm_factor) = if let Some(ref val_path) = config.validation_data {
         let (peptides, norm) = load_peptide_data(
             val_path,
+            &config.model_arch,
             Some(config.nce),
             Some(config.instrument.clone()),
             Some(config.rt_normalization.clone().unwrap()),

From 32e117c6b88bcb27c6b5b1e1f655b83d1429b1dd Mon Sep 17 00:00:00 2001
From: singjc
Date: Tue, 13 May 2025 00:39:49 -0400
Subject: [PATCH 44/75] fix: modification name and index retrieval

---
 .../src/building_blocks/featurize.rs |  22 ++-
 .../src/models/ccs_cnn_tf_model.rs   |  31 ++++
 .../src/models/model_interface.rs    |  29 ++-
 .../src/utils/peptdeep_utils.rs      | 166 +++++++++++++++---
 4 files changed, 212 insertions(+), 36 deletions(-)

diff --git a/crates/redeem-properties/src/building_blocks/featurize.rs b/crates/redeem-properties/src/building_blocks/featurize.rs
index 751464e..f7628f2 100644
--- a/crates/redeem-properties/src/building_blocks/featurize.rs
+++ b/crates/redeem-properties/src/building_blocks/featurize.rs
@@ -26,8 +26,12 @@ pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result<Tensor> {
     let map = aa_index_map();
     let filtered: Vec<i64> =
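        // filter_map previously dropped residues missing from the index map, silently
        // shortening the sequence and shifting downstream mod_sites; the rewrite below
        // fails fast on the first unknown character instead.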
seq .chars() - .filter_map(|c| map.get(&c).copied()) - .collect(); + .map(|c| { + map.get(&c) + .copied() + .ok_or_else(|| anyhow!("Unknown amino acid character: '{}'", c)) + }) + .collect::>>()?; let mut indices = vec![0i64]; // padding start indices.extend(filtered); indices.push(0); // padding end @@ -43,6 +47,20 @@ pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result let num_classes = AA_EMBEDDING_SIZE; let indices = aa_indices.to_vec2::()?; + + for (i, row) in indices.iter().enumerate() { + for (j, val) in row.iter().enumerate() { + if !val.is_finite() || *val < 0.0 || *val > (AA_EMBEDDING_SIZE as f32) { + log::error!( + "[aa_one_hot] Invalid index at batch {}, position {}: {}", + i, j, val + ); + } + } + } + + + let mut one_hot_data = vec![0.0f32; batch_size * seq_len * num_classes]; one_hot_data diff --git a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs index ab41d5c..e1e0ff2 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs @@ -243,3 +243,34 @@ impl ModelInterface for CCSCNNTFModel { } +#[cfg(test)] +mod tests { + use super::*; + use crate::models::model_interface::ModelInterface; + use crate::models::ccs_cnn_tf_model::CCSCNNTFModel; + use candle_core::Device; + use std::path::PathBuf; + + + #[test] + fn test_encode_peptides() { + let device = Device::Cpu; + let model = Box::new(CCSCNNTFModel::new_untrained(device.clone()).unwrap()); + + let peptide_sequences = "AGHCEWQMKYR"; + let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; + let mod_sites = "0;4;8"; + let charge = Some(2); + let nce = Some(20); + let instrument = Some("QE"); + + let result = + model.encode_peptide(&peptide_sequences, mods, mod_sites, charge, nce, instrument); + + println!("{:?}", result); + + // assert!(result.is_ok()); + // let encoded_peptides = result.unwrap(); + // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1)); + } +} \ No newline at end of file diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index ef68e24..5f41e76 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -293,7 +293,8 @@ pub trait ModelInterface: Send + Sync + ModelClone { let mod_feature_size = self.get_mod_element_count(); let mod_to_feature = self.get_mod_to_feature(); - log::trace!("[ModelInterface::encode_peptide] peptide_sequence: {}", peptide_sequence); + log::trace!("[ModelInterface::encode_peptide] peptide_sequence: {} | mods: {} | mod_sites: {} | charge: {:?} | nce: {:?} | instrument: {:?}", peptide_sequence, mods, mod_sites, charge, nce, instrument); + let aa_tensor = aa_indices_tensor(peptide_sequence, device)?; let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?; @@ -401,20 +402,36 @@ pub trait ModelInterface: Send + Sync + ModelClone { .max() .unwrap_or(0); + // Consistency check for feature dimension + let expected_feat_dim = tensors + .get(0) + .ok_or_else(|| anyhow::anyhow!("Empty input batch"))? + .shape() + .dims3()? 
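            // dims3() yields (batch, seq_len, feat_dim); the .2 below selects feat_dim,
            // which every tensor in the batch must share before padding and concatenation.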
+ .2; + let padded = tensors .into_par_iter() .map(|t| { let (_, seq_len, feat_dim) = t.shape().dims3()?; + + // Check that all tensors have the same feature dimension + if feat_dim != expected_feat_dim { + return Err(anyhow::anyhow!( + "Inconsistent feature dim: expected {}, got {}", + expected_feat_dim, + feat_dim + )); + } + if seq_len < max_len { - let pad = - Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?; - Tensor::cat(&[&t, &pad], 1) + let pad = Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?; + Ok(Tensor::cat(&[&t, &pad], 1)?) } else { Ok(t) } }) - .map(|res| res.map_err(anyhow::Error::from)) - .collect::, _>>()?; + .collect::, anyhow::Error>>()?; Ok(Tensor::cat(&padded, 0)?) } diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs index 3a6702e..dfdfcc5 100644 --- a/crates/redeem-properties/src/utils/peptdeep_utils.rs +++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs @@ -343,12 +343,26 @@ pub fn extract_masses_and_indices(peptide: &str) -> Vec<(f64, usize)> { pub fn get_modification_indices(peptide: &str) -> String { let re = Regex::new(r"(\[.*?\]|\(UniMod:\d+\)|\([a-zA-Z]+\))").unwrap(); let mut indices = Vec::new(); - let mut offset = 1; // Offset by 1 for 0-based index + let mut offset = 0; + let mut aa_index = 0; + let mut i = 0; + + while i < peptide.len() { + let c = peptide[i..].chars().next().unwrap(); + + if c == '[' || c == '(' { + if let Some(mat) = re.find_at(peptide, i) { + if mat.start() == i { + // If the modification is at the beginning (i == 0), it's on the N-term + indices.push(aa_index.to_string()); + i = mat.end(); + continue; + } + } + } - for mat in re.find_iter(peptide) { - let index = mat.start().saturating_sub(offset); - indices.push(index.to_string()); - offset += mat.end() - mat.start(); + aa_index += 1; + i += c.len_utf8(); } indices.join(";") @@ -403,27 +417,60 @@ pub fn extract_unimod_annotations(peptide: &str) -> Vec<(String, usize)> { let re_unimod = Regex::new(r"\(UniMod:(\d+)\)").unwrap(); let mut results = Vec::new(); let mut offset = 0; + let mut aa_index = 0; let mut idx = 0; while idx < peptide.len() { if let Some(mat) = re_unimod.find_at(peptide, idx) { if mat.start() == idx { + // UniMod annotation let cap = re_unimod.captures(&peptide[idx..mat.end()]).unwrap(); let unimod_str = format!("UniMod:{}", &cap[1]); - let pos = idx - offset; - results.push((unimod_str, pos)); + results.push((unimod_str, aa_index)); offset += mat.end() - mat.start(); idx = mat.end(); continue; } } - idx += peptide[idx..].chars().next().unwrap().len_utf8(); + + // Only increment aa_index on actual amino acid + let ch = peptide[idx..].chars().next().unwrap(); + if ch.is_alphabetic() { + aa_index += 1; + } + idx += ch.len_utf8(); } results } +/// Extracts either mass shift or UniMod annotations from a peptide string, +/// returning a vector of (mod_str, position). +/// +/// Dispatches to `extract_mass_annotations` if it finds `[+mass]`, +/// or to `extract_unimod_annotations` if it finds `(UniMod:id)`. 
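+/// Positions are 1-based residue indices, with 0 reserved for N-terminal
+/// modifications; a peptide with neither annotation style yields an empty vector.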
+/// +/// # Example +/// ``` +/// let mass = extract_mod_annotations("AC[+57.0215]DE"); +/// assert_eq!(mass, vec![("57.0215".to_string(), 2)]); +/// +/// let unimod = extract_mod_annotations("AC(UniMod:4)DE"); +/// assert_eq!(unimod, vec![("UniMod:4".to_string(), 2)]); +/// ``` +pub fn extract_mod_annotations(peptide: &str) -> Vec<(String, usize)> { + if peptide.contains("[+") || peptide.contains("[-") { + extract_mass_annotations(peptide) + } else if peptide.contains("(UniMod:") { + extract_unimod_annotations(peptide) + } else { + Vec::new() + } +} + + + /// Attempts to look up a modification name from a map using the provided key and amino acid. /// Falls back to a key with `None` if the exact amino acid is not matched. /// @@ -438,16 +485,17 @@ pub fn extract_unimod_annotations(peptide: &str) -> Vec<(String, usize)> { /// ``` pub fn lookup_modification( key: String, - aa: char, + aa: Option, map: &HashMap<(String, Option), ModificationMap>, ) -> Option { - map.get(&(key.clone(), Some(aa))) + map.get(&(key.clone(), aa)) .or_else(|| map.get(&(key, None))) .map(|m| m.name.clone()) } + /// Generates a standardized modification string (e.g., "Carbamidomethyl@C") /// for a peptide sequence based on mass shifts (e.g., `[+57.0215]`) or /// UniMod annotations (e.g., `(UniMod:4)`), using a preloaded modification map. @@ -484,22 +532,43 @@ pub fn get_modification_string( modification_map: &HashMap<(String, Option), ModificationMap>, ) -> String { let naked_peptide = remove_mass_shift(peptide); - let mut found_mods = Vec::new(); - for (key, pos) in extract_mass_annotations(peptide) + extract_mod_annotations(peptide) .into_iter() - .chain(extract_unimod_annotations(peptide)) - { - let aa = naked_peptide.chars().nth(pos.saturating_sub(1)).unwrap_or('\0'); - if let Some(name) = lookup_modification(key, aa, modification_map) { - found_mods.push(name); - } - } - - found_mods.join(";") + .filter_map(|(key, pos)| { + let aa_opt = if pos == 0 { + naked_peptide.chars().next() + } else { + naked_peptide.chars().nth(pos - 1) + }; + + // Try normal lookup first + let mod_str = lookup_modification(key.clone(), aa_opt, modification_map); + + // If not found and it's a terminal mod, look for Protein_N-term + if mod_str.is_none() && pos == 0 { + // Try all entries with same key and look for *_N-term + let fallback = modification_map + .iter() + .find_map(|((k, _), v)| { + if k == &key && (v.name.contains("Protein_N-term") || v.name.contains("Any_N-term")) { + Some(v.name.clone()) + } else { + None + } + }); + fallback + } else { + mod_str + } + }) + .collect::>() + .join(";") } + + // TODO: Derive from PeptDep constants yaml const IM_GAS_MASS: f64 = 28.0; const CCS_IM_COEF: f64 = 1059.62245; @@ -573,6 +642,44 @@ mod tests { assert!(result.is_ok()); } + #[test] + fn test_extract_unimod_annotations() { + let peptide = "AC(UniMod:4)DE(UniMod:7)FG"; + let result = extract_unimod_annotations(peptide); + println!("Peptide: {}, Result: {:?}", peptide, result); + assert_eq!(result, vec![("UniMod:4".to_string(), 2), ("UniMod:7".to_string(), 4)]); + + let peptide = "AC(UniMod:4)DE(UniMod:7)FG(UniMod:10)"; + let result = extract_unimod_annotations(peptide); + println!("Peptide: {}, Result: {:?}", peptide, result); + assert_eq!( + result, + vec![ + ("UniMod:4".to_string(), 2), + ("UniMod:7".to_string(), 4), + ("UniMod:10".to_string(), 6) + ] + ); + + let peptide = "(UniMod:1)M(UniMod:35)AAAATMAAAAR"; + let result = extract_unimod_annotations(peptide); + println!("Peptide: {}, Result: {:?}", peptide, result); + 
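        // The leading (UniMod:1) precedes any residue, so it is reported at position 0
        // (the N-terminus), while (UniMod:35) sits on the first residue, position 1.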
assert_eq!(result, vec![("UniMod:1".to_string(), 0), ("UniMod:35".to_string(), 1)]); + } + + #[test] + fn test_extract_mod_annotations() { + let peptide = "[+42.0105]M[+15.9949]AAAATMAAAAR"; + let result = extract_mod_annotations(peptide); + println!("Peptide: {}, Result: {:?}", peptide, result); + assert_eq!(result, vec![("42.0105".to_string(), 0), ("15.9949".to_string(), 1)]); + + let peptide = "(UniMod:1)M(UniMod:35)AAAATMAAAAR"; + let result = extract_mod_annotations(peptide); + println!("Peptide: {}, Result: {:?}", peptide, result); + assert_eq!(result, vec![("UniMod:1".to_string(), 0), ("UniMod:35".to_string(), 1)]); + } + #[test] fn test_get_modification_indices() { // Compile the regex once for all tests @@ -581,13 +688,14 @@ mod tests { // Test cases let test_cases = vec![ ("PEPTIDE", ""), - ("PEPT[+15.9949]IDE", "3"), - ("P[+15.9949]EPT[+79.99]IDE", "0;3"), - ("TVQSLEIDLDSM[+15.9949]R", "11"), - ("TVQS[+79.99]LEIDLDSM[+15.9949]R", "3;11"), + ("PEPT[+15.9949]IDE", "4"), + ("P[+15.9949]EPT[+79.99]IDE", "1;4"), + ("TVQSLEIDLDSM[+15.9949]R", "12"), + ("TVQS[+79.99]LEIDLDSM[+15.9949]R", "4;12"), ("[+42.0106]PEPTIDE", "0"), - ("PEPTIDE[+42.0106]", "6"), - ("P[+15.9949]EP[+79.99]T[+15.9949]IDE", "0;2;3"), + ("PEPTIDE[+42.0106]", "7"), + ("P[+15.9949]EP[+79.99]T[+15.9949]IDE", "1;3;4"), + ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "0;1"), ]; for (peptide, expected) in test_cases { @@ -630,9 +738,11 @@ mod tests { ("P[+15.9949]EPT[+79.9663]IDE", "Oxidation@P;Phospho@T"), ("TVQSLEIDLDSM[+15.9949]R", "Oxidation@M"), ("TVQS[+79.9663]LEIDLDSM[+15.9949]R", "Phospho@S;Oxidation@M"), - ("[+42.0106]PEPTIDE", "Acetyl@Protein_N-term"), + ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Any_N-term;Oxidation@M"), + ("[+42.0106]PEPTIDE", "Any_N-term"), ("PEPTIDE[+42.0106]", ""), ("P[+15.9949]EP[+79.9663]T[+15.9949]IDE", "Oxidation@P;Oxidation@T"), + ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Any_N-term;Oxidation@M"), ]; From f0354a8bb654ffbf0a487780cc445a93f07ee8df Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 13 May 2025 10:33:10 -0400 Subject: [PATCH 45/75] refactor: Update PeptideData struct to use u8 for string fields --- .../src/properties/inference/inference.rs | 5 +- .../src/properties/inference/output.rs | 30 +- crates/redeem-cli/src/properties/load_data.rs | 49 +- .../src/properties/train/trainer.rs | 8 +- .../examples/alphapeptdeep_ccs_cnn_lstm.rs | 143 ++---- .../examples/alphapeptdeep_ms2_bert.rs | 327 +++++-------- .../examples/alphapeptdeep_rt_cnn_lstm.rs | 237 +++------- .../src/models/ccs_cnn_lstm_model.rs | 6 +- .../redeem-properties/src/models/ccs_model.rs | 2 +- .../src/models/model_interface.rs | 433 +++++++++--------- .../src/models/ms2_bert_model.rs | 10 +- .../redeem-properties/src/models/ms2_model.rs | 2 +- .../src/models/rt_cnn_lstm_model.rs | 59 +-- .../redeem-properties/src/models/rt_model.rs | 2 +- .../src/utils/data_handling.rs | 118 ++++- 15 files changed, 656 insertions(+), 775 deletions(-) diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs index e8caef3..5fc6a91 100644 --- a/crates/redeem-cli/src/properties/inference/inference.rs +++ b/crates/redeem-cli/src/properties/inference/inference.rs @@ -15,6 +15,8 @@ use crate::properties::load_data::load_peptide_data; use crate::properties::util::write_bytes_to_file; pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { + let modifications = load_modifications().context("Failed to load modifications")?; + // Load inference data let (inference_data, 
norm_factor) = load_peptide_data( &config.inference_data, @@ -22,6 +24,7 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { Some(config.nce), Some(config.instrument.clone()), Some("min_max".to_string()), + &modifications, )?; log::info!("Loaded {} peptides", inference_data.len()); @@ -74,8 +77,6 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> { } }; - let modifications = load_modifications().context("Failed to load modifications")?; - let start_time = std::time::Instant::now(); model.set_evaluation_mode(); let inference_results: Vec = model.inference( diff --git a/crates/redeem-cli/src/properties/inference/output.rs b/crates/redeem-cli/src/properties/inference/output.rs index 2e91c97..0922834 100644 --- a/crates/redeem-cli/src/properties/inference/output.rs +++ b/crates/redeem-cli/src/properties/inference/output.rs @@ -1,11 +1,13 @@ use std::fs::File; -use std::io::{BufWriter, Write}; +use std::io::BufWriter; use anyhow::{Result, Context}; use std::path::Path; use redeem_properties::utils::data_handling::PeptideData; /// Write a vector of PeptideData to a CSV or TSV file based on file extension. pub fn write_peptide_data>(data: &[PeptideData], output_path: P) -> Result<()> { + + let path = output_path.as_ref(); let extension = path.extension().and_then(|ext| ext.to_str()).unwrap_or("csv"); let delimiter = match extension { @@ -19,7 +21,20 @@ pub fn write_peptide_data>(data: &[PeptideData], output_path: P) .from_writer(BufWriter::new(file)); // Write headers - writer.write_record(&["sequence", "charge", "nce", "instrument", "retention_time", "ion_mobility", "ms2_intensities"])?; + writer.write_record(&[ + "modified_sequence", + "naked_sequence", + "mods", + "mod_sites", + "charge", + "precursor_mass", + "nce", + "instrument", + "retention_time", + "ion_mobility", + "ccs", + "ms2_intensities", + ])?; for entry in data { let ms2_str = entry.ms2_intensities.as_ref() @@ -31,16 +46,21 @@ pub fn write_peptide_data>(data: &[PeptideData], output_path: P) .unwrap_or_default(); writer.write_record(&[ - &entry.sequence, + entry.modified_sequence_str(), + entry.naked_sequence_str(), + entry.mods_str(), + entry.mod_sites_str(), &entry.charge.map_or(String::new(), |c| c.to_string()), + &entry.precursor_mass.map_or(String::new(), |m| format!("{:.4}", m)), &entry.nce.map_or(String::new(), |n| n.to_string()), - &entry.instrument.clone().unwrap_or_default(), + &entry.instrument_str().unwrap_or_default().to_string(), &entry.retention_time.map_or(String::new(), |r| format!("{:.4}", r)), &entry.ion_mobility.map_or(String::new(), |im| format!("{:.4}", im)), + &entry.ccs.map_or(String::new(), |c| format!("{:.4}", c)), &ms2_str, ])?; } writer.flush()?; Ok(()) -} \ No newline at end of file +} diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs index e29d239..eac4717 100644 --- a/crates/redeem-cli/src/properties/load_data.rs +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -1,9 +1,12 @@ +use std::{collections::HashMap, sync::Arc}; use std::fs::File; use std::path::Path; use std::io::BufReader; use anyhow::{Result, Context}; use csv::ReaderBuilder; -use redeem_properties::utils::data_handling::{PeptideData, RTNormalization}; +use redeem_properties::utils::peptdeep_utils::{get_modification_indices, get_modification_string, ModificationMap}; +use redeem_properties::utils::{data_handling::{PeptideData, RTNormalization}, peptdeep_utils::remove_mass_shift}; + /// Load peptide training data from a CSV or TSV file 
and optionally normalize RT. @@ -15,6 +18,7 @@ pub fn load_peptide_data>( nce: Option, instrument: Option, normalize_rt: Option, + modifications: &HashMap<(String, Option), ModificationMap>, ) -> Result<(Vec, RTNormalization)> { let file = File::open(&path) .with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?; @@ -35,10 +39,21 @@ pub fn load_peptide_data>( for result in rdr.records() { let record = result?; - let sequence = record - .get(headers.iter().position(|h| h.to_lowercase() == "sequence").unwrap_or(2)) - .unwrap_or("") - .to_string(); + let sequence_bytes: Arc<[u8]> = Arc::from( + record + .get(headers.iter().position(|h| h.to_lowercase() == "sequence").unwrap_or(2)) + .unwrap_or("") + .as_bytes() + .to_vec() + .into_boxed_slice(), + ); + + let sequence_str = String::from_utf8_lossy(&sequence_bytes); + + let naked_sequence = Arc::from(remove_mass_shift(&sequence_str).as_bytes().to_vec().into_boxed_slice()); + + let mods: Arc<[u8]> = Arc::from(get_modification_string(&sequence_str, modifications).into_bytes().into_boxed_slice()); + let mod_sites: Arc<[u8]> = Arc::from(get_modification_indices(&sequence_str).into_bytes().into_boxed_slice()); let retention_time = record .get(headers.iter().position(|h| h.to_lowercase() == "retention time").unwrap_or(3)) @@ -69,25 +84,31 @@ pub fn load_peptide_data>( .get(headers.iter().position(|h| h.to_lowercase() == "nce").unwrap_or(usize::MAX)) .and_then(|s| s.parse::().ok()) }), - _ => None - + _ => None, }; let in_instrument = match model_arch { - "ms2_bert" => instrument.clone().or_else(|| { - record - .get(headers.iter().position(|h| h.to_lowercase() == "instrument").unwrap_or(usize::MAX)) - .map(|s| s.to_string()) - }), - _ => None + "ms2_bert" => instrument + .as_ref() + .map(|s| Arc::from(s.as_bytes().to_vec().into_boxed_slice())) + .or_else(|| { + record + .get(headers.iter().position(|h| h.to_lowercase() == "instrument").unwrap_or(usize::MAX)) + .map(|s| Arc::from(s.as_bytes().to_vec().into_boxed_slice())) + }), + _ => None, }; + if let Some(rt) = retention_time { rt_values.push(rt); } peptides.push(PeptideData { - sequence, + modified_sequence: sequence_bytes, + naked_sequence, + mods, + mod_sites, charge, precursor_mass, nce: in_nce, diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index 6324e0c..2c6c657 100644 --- a/crates/redeem-cli/src/properties/train/trainer.rs +++ b/crates/redeem-cli/src/properties/train/trainer.rs @@ -24,6 +24,9 @@ use load_data::load_peptide_data; use super::input; pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { + log::trace!("Loading modifications map"); + let modifications = load_modifications().context("Failed to load modifications")?; + // Load training data let (train_peptides, norm_factor) = load_peptide_data( &config.train_data, @@ -31,6 +34,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { Some(config.nce), Some(config.instrument.clone()), Some(config.rt_normalization.clone().unwrap()), + &modifications, )?; log::info!("Loaded {} training peptides", train_peptides.len()); @@ -42,6 +46,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { Some(config.nce), Some(config.instrument.clone()), Some(config.rt_normalization.clone().unwrap()), + &modifications, ) .context("Failed to load validation data")?; (Some(peptides), Some(norm)) @@ -128,9 +133,6 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { log::trace!("Model loaded successfully"); - 
log::trace!("Loading modifications map"); - let modifications = load_modifications().context("Failed to load modifications")?; - let start_time = std::time::Instant::now(); log::trace!("Training started"); let train_step_metrics = model.train( diff --git a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs index 5b7f52b..0236f0a 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs @@ -2,28 +2,31 @@ use anyhow::{Context, Result}; use candle_core::Device; use redeem_properties::{ models::{ - model_interface::{ModelInterface, PredictionResult}, ccs_cnn_lstm_model::CCSCNNLSTMModel, + model_interface::{ModelInterface, PredictionResult}, + }, + utils::{ + data_handling::{PeptideData}, + peptdeep_utils::{ion_mobility_to_ccs_bruker}, }, - utils::{data_handling::PeptideData, peptdeep_utils::{load_modifications, ccs_to_mobility_bruker, ion_mobility_to_ccs_bruker}}, }; -use std::path::PathBuf; +use std::{path::PathBuf, sync::Arc}; -struct PredictionContext { - peptides: Vec, - mods: Vec, - mod_sites: Vec, +struct PredictionContext<'a> { + peptides: Vec<&'a str>, + mods: Vec<&'a str>, + mod_sites: Vec<&'a str>, charges: Vec, observed_ccs: Vec, } -impl PredictionContext { - fn new(test_peptides: &Vec<(&str, &str, &str, i32, f32)>) -> Self { - let peptides: Vec = test_peptides.iter().map(|(pep, _, _, _, _)| pep.to_string()).collect(); - let mods: Vec = test_peptides.iter().map(|(_, mod_, _, _, _)| mod_.to_string()).collect(); - let mod_sites: Vec = test_peptides.iter().map(|(_, _, sites, _, _)| sites.to_string()).collect(); - let charges: Vec = test_peptides.iter().map(|(_, _, _, charge, _)| *charge).collect(); - let observed_ccs: Vec = test_peptides.iter().map(|(_, _, _, _, ccs)| *ccs).collect(); +impl<'a> PredictionContext<'a> { + fn new(test_peptides: &'a [(&'a str, &'a str, &'a str, i32, f32)]) -> Self { + let peptides = test_peptides.iter().map(|(pep, _, _, _, _)| *pep).collect(); + let mods = test_peptides.iter().map(|(_, m, _, _, _)| *m).collect(); + let mod_sites = test_peptides.iter().map(|(_, _, s, _, _)| *s).collect(); + let charges = test_peptides.iter().map(|(_, _, _, c, _)| *c).collect(); + let observed_ccs = test_peptides.iter().map(|(_, _, _, _, ccs)| *ccs).collect(); Self { peptides, @@ -35,90 +38,50 @@ impl PredictionContext { } } -fn run_prediction(model: &mut CCSCNNLSTMModel, prediction_context: &PredictionContext) -> Result<()> { +fn run_prediction(model: &mut CCSCNNLSTMModel, ctx: &PredictionContext) -> Result<()> { match model.predict( - &prediction_context.peptides, - &prediction_context.mods, - &prediction_context.mod_sites, - Some(prediction_context.charges.clone()), + &ctx.peptides, + &ctx.mods, + &ctx.mod_sites, + Some(ctx.charges.clone()), None, None, - ) { - Ok(predictions) => { - if let PredictionResult::CCSResult(ccs_preds) = predictions { - let total_error: f32 = ccs_preds - .iter() - .zip(prediction_context.observed_ccs.iter()) - .map(|(pred, obs)| (pred - obs).abs()) - .sum(); - - print_predictions(&prediction_context.peptides, &ccs_preds, &prediction_context.observed_ccs); - - let mean_absolute_error = total_error / ccs_preds.len() as f32; - println!("Mean Absolute Error: {:.6}", mean_absolute_error); - } else { - println!("Unexpected prediction result type."); + )? 
{ + PredictionResult::CCSResult(preds) => { + let total_error: f32 = preds + .iter() + .zip(ctx.observed_ccs.iter()) + .map(|(pred, obs)| (pred - obs).abs()) + .sum(); + + for (pep, pred, obs) in itertools::izip!(&ctx.peptides, &preds, &ctx.observed_ccs) { + println!("Peptide: {}, Predicted CCS: {:.4}, Observed CCS: {:.4}", pep, pred, obs); } - } - Err(e) => { - println!("Error during batch prediction: {:?}", e); - } - } - Ok(()) -} -fn print_predictions(peptides: &[String], ccs_preds: &[f32], observed_ccs: &[f32]) { // Changed - let mut peptides_iter = peptides.iter(); - let mut ccs_preds_iter = ccs_preds.iter(); // Changed - let mut observed_ccs_iter = observed_ccs.iter(); // Changed - - loop { - match (peptides_iter.next(), ccs_preds_iter.next(), observed_ccs_iter.next()) { - (Some(pep), Some(pred), Some(obs)) => { - println!("Peptide: {}, Predicted CCS: {}, Observed CCS: {}", pep, pred, obs); // Changed - } - _ => break, // Exit the loop if any iterator is exhausted + let mae = total_error / preds.len() as f32; + println!("Mean Absolute Error: {:.6}", mae); } + _ => println!("Unexpected prediction result type."), } + Ok(()) } fn main() -> Result<()> { let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth"); let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); - - // let device use cuda if available otherwise use cpu let device = Device::new_cuda(0).unwrap_or(Device::Cpu); println!("Device: {:?}", device); - let mut model = CCSCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) - .context("Failed to create CCSCNNLSTMModel")?; - - // Define training data - let training_data = vec![ - PeptideData::new("EHVIIQAEFYLNPDQ", Some(2), None, None, None, Some(1.10), None), - PeptideData::new("KTLTGKTITLEVEPS", Some(2), None, None, None, Some(1.04), None), - PeptideData::new("SLLAQNTSWLL", Some(1), None, None, None, Some(1.67), None), - PeptideData::new("SLQEVAM[+15.9949]FL", Some(1), None, None, None, Some(1.53), None), - PeptideData::new("VLADQVWTL", Some(2), None, None, None, Some(0.839), None), - PeptideData::new("LLMEPGAMRFL", Some(2), None, None, None, Some(0.949), None), - PeptideData::new("SGEIKIAYTYSVS", Some(2), None, None, None, Some(0.974), None), - PeptideData::new("HTEIVFARTSPQQKL", Some(2), None, None, None, Some(1.13), None), - PeptideData::new("SM[+15.9949]ADIPLGFGV", Some(1), None, None, None, Some(1.59), None), - PeptideData::new("KLIDHQGLYL", Some(2), None, None, None, Some(0.937), None), - ]; - - // Sequence Monoisotopic Mass (Da) Charge m/z - // SKEEETSIDVAGKP 1488.7308 2 745.3727 - // LPILVPSAKKAIYM 1542.9208 2 772.4677 - // RTPKIQVYSRHPAE 1680.906 3 561.3093 - // EEVQIDILDTAGQE 1558.7362 2 780.3754 - // GAPLVKPLPVNPTDPA 1584.8875 2 793.4511 - // FEDENFILK 1153.5655 2 577.7901 - // YPSLPAQQV 1001.5182 1 1002.5255 - // YLPPATQVV 986.5437 2 494.2792 - // YISPDQLADLYK 1424.7187 2 713.3667 - // PSIVRLLQCDPSSAGQF 1816.9142 2 909.4644 + let mut model = CCSCNNLSTMModel::new( + &model_path, + Some(&constants_path), + 0, + 8, + 4, + true, + device, + )?; let test_peptides = vec![ ("SKEEETSIDVAGKP", "", "", 2, ion_mobility_to_ccs_bruker(0.998, 2, 745.3727)), @@ -133,20 +96,8 @@ fn main() -> Result<()> { ("PSIVRLLQCDPSSAGQF", "", "", 2, ion_mobility_to_ccs_bruker(1.10, 2, 909.4644)), ]; - let prediction_context = PredictionContext::new(&test_peptides); - - run_prediction(&mut model, &prediction_context)?; - - // Fine-tune the model - let modifications = load_modifications().context("Failed to load 
modifications")?; - let learning_rate = 0.001; - let epochs = 5; - model - .fine_tune(&training_data, modifications, 10, learning_rate, epochs) - .context("Failed to fine-tune the model")?; - - // Test prediction again with a few peptides after fine-tuning - run_prediction(&mut model, &prediction_context)?; + let ctx = PredictionContext::new(&test_peptides); + run_prediction(&mut model, &ctx)?; Ok(()) } diff --git a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs index b3ee4b1..cf7b8f7 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs @@ -6,245 +6,142 @@ use redeem_properties::{ model_interface::{ModelInterface, PredictionResult}, ms2_bert_model::MS2BertModel, }, - utils::{data_handling::PeptideData, peptdeep_utils::{get_modification_indices, get_modification_string, load_modifications, remove_mass_shift, ModificationMap}}, + utils::{ + data_handling::{PeptideData, PeptideBatchData}, + peptdeep_utils::{get_modification_indices, get_modification_string, load_modifications, remove_mass_shift, ModificationMap}, + }, }; use std::{ - collections::HashMap, fs::File, path::PathBuf + collections::HashMap, fs::File, path::PathBuf, sync::Arc }; -struct PredictionContext { - peptides: Vec, - naked_peptides: Vec, - mods: Vec, - mod_sites: Vec, - charges: Vec, - nces: Vec, - instruments: Vec, - ms2_intensities: Vec>>, -} - -impl PredictionContext { - fn new(training_data: &Vec, modification_map: &HashMap<(String, Option), ModificationMap>) -> Self { - let peptides: Vec = training_data.iter().map(|p| p.sequence.clone()).collect(); - - let naked_peptides: Vec = training_data.iter().map(|p| remove_mass_shift(&p.sequence)).collect(); - let naked_peptides: Vec = naked_peptides.iter().map(|p| p.trim_start_matches("-").to_string()).collect(); - - - // Get mod_str with get_modification_string - let mod_strs: Vec = training_data.iter().map(|p| get_modification_string(&p.sequence, modification_map)).collect(); - - /// Get modification indices with get_modification_indices - let mod_sites: Vec = training_data.iter().map(|p| get_modification_indices(&p.sequence)).collect(); - - - let charges: Vec = training_data.iter().map(|p| p.charge.unwrap()).collect(); - let nces: Vec = training_data.iter().map(|p| p.nce.unwrap()).collect(); - let instruments: Vec = training_data.iter().map(|p| p.instrument.clone().unwrap()).collect(); - let ms2_intensities: Vec>> = training_data.iter().map(|p| p.ms2_intensities.clone().unwrap()).collect(); - - Self { - peptides, - naked_peptides, - mods: mod_strs, - mod_sites, - charges, - nces, - instruments, - ms2_intensities, - } - } -} - -fn run_prediction(model: &mut MS2BertModel, prediction_context: &PredictionContext) -> Result<()> { // Changed Model - match model.predict( - &prediction_context.naked_peptides, - &prediction_context.mods, - &prediction_context.mod_sites, - Some(prediction_context.charges.clone()), - Some(prediction_context.nces.clone()), - Some(prediction_context.instruments.clone()), - ) { - Ok(predictions) => { - if let PredictionResult::MS2Result(ms2_preds) = predictions { - let total_error: f32 = ms2_preds - .iter() - .zip(prediction_context.ms2_intensities.iter()) - .map(|(outer_pred, outer_obs)| { - outer_pred - .iter() - .zip(outer_obs.iter()) - .map(|(inner_pred, inner_obs)| { - inner_pred - .iter() - .zip(inner_obs.iter()) - .map(|(pred, obs)| (pred - obs).abs()) - .sum::() // Sum the innermost 
differences - }) - .sum::() // Sum the differences from the middle vectors +fn run_prediction(model: &mut MS2BertModel, batch_data: &[PeptideData]) -> Result<()> { + let batch = PeptideBatchData::from(batch_data); + + let peptides = batch.naked_sequence_strs(); + let mods = batch.mods_strs(); + let mod_sites = batch.mod_sites_strs(); + + let charges = if batch.charges.iter().all(|c| c.is_some()) { + Some(batch.charges.iter().map(|c| c.unwrap()).collect()) + } else { + None + }; + let nces = if batch.nces.iter().all(|n| n.is_some()) { + Some(batch.nces.iter().map(|n| n.unwrap()).collect()) + } else { + None + }; + let instruments = if batch.instruments.iter().all(|i| i.is_some()) { + let flat: Vec<&str> = batch.instrument_strs().into_iter().map(|opt| opt.unwrap()).collect(); + Some(flat) + } else { + None + }; + + let predictions = model.predict(&peptides, &mods, &mod_sites, charges, nces, instruments.as_ref())?; + + if let PredictionResult::MS2Result(ms2_preds) = predictions { + let total_error: f32 = ms2_preds + .iter() + .zip(batch.ms2_intensities.iter()) + .map(|(pred, obs)| { + pred.iter() + .zip(obs.as_ref().unwrap()) + .map(|(p_row, o_row)| { + p_row.iter().zip(o_row.iter()).map(|(p, o)| (p - o).abs()).sum::() }) - .sum::(); // Sum the differences from the outer vectors - - - print_predictions(&prediction_context.peptides, &ms2_preds, &prediction_context.ms2_intensities); - - let mean_absolute_error = total_error / ms2_preds.len() as f32; - println!("Mean Absolute Error: {:.6}", mean_absolute_error); - } else { - println!("Unexpected prediction result type."); - } - } - Err(e) => { - println!("Error during batch prediction: {:?}", e); + .sum::() + }) + .sum(); + + for (i, peptide) in batch.naked_sequence.iter().enumerate() { + let pred_sum: f32 = ms2_preds[i].iter().flatten().sum(); + let obs_sum: f32 = batch.ms2_intensities[i] + .as_ref() + .map(|v| v.iter().flatten().sum()) + .unwrap_or(0.0); + println!( + "Peptide: {}\n Predicted Intensity Sum: {:.4}\n Observed Intensity Sum: {:.4}", + std::str::from_utf8(peptide).unwrap_or(""), pred_sum, obs_sum + ); } - } - Ok(()) -} -fn print_predictions( - peptides: &[String], - predicted_ms2_intensities: &Vec>>, - observed_ms2_intensities: &Vec>>, -) { - let mut peptides_iter = peptides.iter(); - let mut predicted_iter = predicted_ms2_intensities.iter(); - let mut observed_iter = observed_ms2_intensities.iter(); - - loop { - match ( - peptides_iter.next(), - predicted_iter.next(), - observed_iter.next(), - ) { - (Some(pep), Some(predicted), Some(observed)) => { - let predicted_sum: f32 = predicted.iter().flat_map(|inner_vec| inner_vec.iter().copied()).sum(); - let observed_sum: f32 = observed.iter().flat_map(|inner_vec| inner_vec.iter().copied()).sum(); - - - println!("Peptide: {}", pep); - println!(" Sum of Predicted Intensities: {:.6}", predicted_sum); - println!(" Sum of Observed Intensities: {:.6}", observed_sum); - } - _ => break, // Exit the loop if any iterator is exhausted - } + let mean_abs_error = total_error / ms2_preds.len() as f32; + println!("Mean Absolute Error: {:.6}", mean_abs_error); } + Ok(()) } fn main() -> Result<()> { let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth"); let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); - - // let device use cuda if available otherwise use cpu let device = Device::new_cuda(0).unwrap_or(Device::Cpu); - println!("Device: {:?}", device); - let mut model = MS2BertModel::new(&model_path, Some(&constants_path), 0, 8, 4, 
true, device) - .context("Failed to create MS2BertModel")?; + let mut model = MS2BertModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device)?; - // Open the CSV file - let file_path = "data/predicted_fragment_intensities.csv"; - let file = File::open(file_path).unwrap(); - - // Create a CSV reader + let file = File::open("data/predicted_fragment_intensities.csv")?; let mut rdr = Reader::from_reader(file); - // Group fragment intensities by peptide sequence - let mut peptide_data_map: HashMap>> = HashMap::new(); - let mut peptide_charges: HashMap = HashMap::new(); - - for result in rdr.records() { - let record = result.unwrap(); - let peptide_sequence = &record[0]; - let precursor_charge: i32 = record[1].parse().unwrap(); - let fragment_type = &record[2]; - let fragment_ordinal: usize = record[3].parse().unwrap(); - let fragment_charge: i32 = record[4].parse().unwrap(); - let experimental_intensity: f32 = record[6].parse().unwrap(); - - // Get naked peptide sequence - let naked_peptide = remove_mass_shift(peptide_sequence); - - // Get length of the peptide sequence - let peptide_len = naked_peptide.len() - 1; - - // Initialize the peptide's intensity matrix if it doesn't exist - peptide_data_map - .entry(peptide_sequence.to_string()) - .or_insert_with(|| vec![vec![0.0; 8]; peptide_len]); // Initialize with enough rows - - // Update the peptide's charge - peptide_charges.insert(peptide_sequence.to_string(), precursor_charge); - - // Determine the column index based on fragment type and charge - let col = match (fragment_type, fragment_charge) { - ("B", 1) => 0, // b_z1 - ("B", 2) => 1, // b_z2 - ("Y", 1) => 2, // y_z1 - ("Y", 2) => 3, // y_z2 - _ => continue, // Skip unsupported fragment types or charges - }; - - // Update the MS2 intensities matrix - let row = peptide_len - 1; // Convert to zero-based index - peptide_data_map - .get_mut(peptide_sequence) - .unwrap() - .resize(row + 1, vec![0.0; 8]); // Ensure the matrix has enough rows - peptide_data_map.get_mut(peptide_sequence).unwrap()[row][col] = experimental_intensity; - } - - // Create PeptideData instances for each peptide - let mut training_data: Vec = Vec::new(); - - for (sequence, ms2_intensities) in peptide_data_map { - let charge = peptide_charges.get(&sequence).copied(); - let peptide_data = PeptideData::new( - &sequence, - charge, - Some(20), // Example NCE - Some("QE"), // Example instrument - None, // Retention time - None, // Ion mobility - Some(ms2_intensities), // MS2 intensities - ); - training_data.push(peptide_data); - } - - println!("Loaded {} peptides from the CSV file.", training_data.len()); - - // Create the prediction context using the training data - let modifications = load_modifications().context("Failed to load modifications")?; - let prediction_context = PredictionContext::new(&training_data, &modifications); - - // Run prediction using the training data as the test data - let result = run_prediction(&mut model, &prediction_context, ); - - match result { - Ok(_) => println!("Ran prediction successfully."), - Err(e) => println!("Failed to run prediction: {:?}", e), + let mut data_map: HashMap>> = HashMap::new(); + let mut charge_map: HashMap = HashMap::new(); + + for rec in rdr.records() { + let rec = rec?; + let seq = &rec[0]; + let charge: i32 = rec[1].parse()?; + let ftype = &rec[2]; + let idx: usize = rec[3].parse()?; + let fz: i32 = rec[4].parse()?; + let intensity: f32 = rec[6].parse()?; + + let naked = remove_mass_shift(seq); + let len = naked.len().saturating_sub(1); + 
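        // b/y fragment series carry one row per backbone bond, hence len = residues - 1.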
charge_map.insert(seq.clone().to_string(), charge); + + let entry = data_map.entry(seq.to_string()).or_insert_with(|| vec![vec![0.0; 8]; len]); + if let Some(row) = entry.get_mut(idx.saturating_sub(1)) { + let col = match (ftype, fz) { + ("B", 1) => 0, + ("B", 2) => 1, + ("Y", 1) => 2, + ("Y", 2) => 3, + _ => continue, + }; + row[col] = intensity; + } } - // Fine-tune the model - - let learning_rate = 0.001; - let epochs = 5; - let result = model - .fine_tune(&training_data, modifications, 3, learning_rate, epochs) - .context("Failed to fine-tune the model"); - - match result { - Ok(_) => println!("Model fine-tuned successfully."), - Err(e) => println!("Failed to fine-tune model: {:?}", e), + let modifications = load_modifications()?; + let mut training_data = Vec::new(); + + for (mod_seq, ms2) in data_map { + let naked = remove_mass_shift(&mod_seq).trim_start_matches('-').to_string(); + let mods = get_modification_string(&mod_seq, &modifications); + let mod_sites = get_modification_indices(&mod_seq); + + training_data.push(PeptideData::new( + &mod_seq, + &naked, + &mods, + &mod_sites, + charge_map.get(&mod_seq).copied(), + None, + Some(20), + Some("QE"), + None, + None, + None, + Some(ms2), + )); } - // Test prediction again with a few peptides after fine-tuning - let result = run_prediction(&mut model, &prediction_context); + println!("Loaded {} peptides.", training_data.len()); + run_prediction(&mut model, &training_data)?; - match result { - Ok(_) => println!("Ran prediction successfully."), - Err(e) => println!("Failed to run prediction: {:?}", e), - } + model.fine_tune(&training_data, modifications, 3, 0.001, 5)?; + run_prediction(&mut model, &training_data)?; Ok(()) } diff --git a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs index 7408473..f206ad7 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs @@ -5,32 +5,27 @@ use redeem_properties::{ model_interface::{ModelInterface, PredictionResult}, rt_cnn_lstm_model::RTCNNLSTMModel, }, - utils::{data_handling::PeptideData, peptdeep_utils::load_modifications}, + utils::{ + data_handling::{PeptideData, PeptideBatchData}, + peptdeep_utils::{load_modifications, remove_mass_shift, get_modification_string, get_modification_indices}, + }, }; use std::path::PathBuf; +use std::sync::Arc; struct PredictionContext { - peptides: Vec, - mods: Vec, - mod_sites: Vec, + peptides: Vec<&'static str>, + mods: Vec<&'static str>, + mod_sites: Vec<&'static str>, observed_rts: Vec, } impl PredictionContext { - fn new(test_peptides: &Vec<(&str, &str, &str, f32)>) -> Self { - let peptides: Vec = test_peptides - .iter() - .map(|(pep, _, _, _)| pep.to_string()) - .collect(); - let mods: Vec = test_peptides - .iter() - .map(|(_, mod_, _, _)| mod_.to_string()) - .collect(); - let mod_sites: Vec = test_peptides - .iter() - .map(|(_, _, sites, _)| sites.to_string()) - .collect(); - let observed_rts: Vec = test_peptides.iter().map(|(_, _, _, rt)| *rt).collect(); + fn new(test_peptides: &[(&'static str, &'static str, &'static str, f32)]) -> Self { + let peptides = test_peptides.iter().map(|(pep, _, _, _)| *pep).collect(); + let mods = test_peptides.iter().map(|(_, m, _, _)| *m).collect(); + let mod_sites = test_peptides.iter().map(|(_, _, site, _)| *site).collect(); + let observed_rts = test_peptides.iter().map(|(_, _, _, rt)| *rt).collect(); Self { peptides, @@ -41,74 +36,43 @@ impl 
PredictionContext { } } -fn run_prediction( - model: &mut RTCNNLSTMModel, - prediction_context: &PredictionContext, -) -> Result<()> { +fn run_prediction(model: &mut RTCNNLSTMModel, context: &PredictionContext) -> Result<()> { match model.predict( - &prediction_context.peptides, - &prediction_context.mods, - &prediction_context.mod_sites, + &context.peptides, + &context.mods, + &context.mod_sites, None, None, None, ) { - Ok(predictions) => { - if let PredictionResult::RTResult(rt_preds) = predictions { + Ok(preds) => { + if let PredictionResult::RTResult(rt_preds) = preds { let total_error: f32 = rt_preds .iter() - .zip(prediction_context.observed_rts.iter()) - .map(|(pred, obs)| (pred - obs).abs()) + .zip(&context.observed_rts) + .map(|(p, o)| (p - o).abs()) .sum(); - print_predictions( - &prediction_context.peptides, - &rt_preds, - &prediction_context.observed_rts, - ); + for ((pep, pred), obs) in context.peptides.iter().zip(rt_preds.iter()).zip(&context.observed_rts) { + println!("Peptide: {}, Predicted RT: {:.6}, Observed RT: {:.6}", pep, pred, obs); + } - let mean_absolute_error = total_error / rt_preds.len() as f32; - println!("Mean Absolute Error: {:.6}", mean_absolute_error); - } else { - println!("Unexpected prediction result type."); - } - } - Err(e) => { - println!("Error during batch prediction: {:?}", e); - } - } - Ok(()) -} - -fn print_predictions(peptides: &[String], rt_preds: &[f32], observed_rts: &[f32]) { - let mut peptides_iter = peptides.iter(); - let mut rt_preds_iter = rt_preds.iter(); - let mut observed_rts_iter = observed_rts.iter(); - - loop { - match ( - peptides_iter.next(), - rt_preds_iter.next(), - observed_rts_iter.next(), - ) { - (Some(pep), Some(pred), Some(obs)) => { println!( - "Peptide: {}, Predicted RT: {}, Observed RT: {}", - pep, pred, obs + "Mean Absolute Error: {:.6}", + total_error / rt_preds.len() as f32 ); } - _ => break, // Exit the loop if any iterator is exhausted } + Err(e) => println!("Prediction error: {e}"), } + Ok(()) } fn main() -> Result<()> { env_logger::init(); - // let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); + let model_path = PathBuf::from("/home/singjc/Documents/github/redeem/rt_fine_tuned.safetensors"); let constants_path = PathBuf::from("/home/singjc/Documents/github/redeem/crates/redeem-properties/data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); - - // let device use cuda if available otherwise use cpu let device = Device::new_cuda(0).unwrap_or(Device::Cpu); println!("Device: {:?}", device); @@ -116,142 +80,57 @@ fn main() -> Result<()> { let mut model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) .context("Failed to create RTCNNLSTMModel")?; - // Define training data + let modifications = load_modifications().context("Failed to load modifications")?; + let training_data: Vec = vec![ - PeptideData::new("AKPLMELIER", None, None, None, Some(0.4231399), None, None), + "AKPLMELIER", + "TEM[+15.9949]VTISDASQR", + "AGKFPSLLTHNENMVAK", + "LSELDDRADALQAGASQFETSAAK", + "FLLQDTVELR", + "SVTEQGAELSNEER", + "EHALLAYTLGVK", + "TVQSLEIDLDSM[+15.9949]R", + "VVSQYSSLLSPMSVNAVM[+15.9949]K", + "TFLALINQVFPAEEDSKK", + ] + .into_iter() + .enumerate() + .map(|(i, seq)| { + let naked = remove_mass_shift(seq); + let mods = get_modification_string(seq, &modifications); + let sites = get_modification_indices(seq); PeptideData::new( - "TEM[+15.9949]VTISDASQR", + seq, + &naked, + &mods, + &sites, None, None, None, - Some(0.2192762), None, - None, - ), - PeptideData::new( - 
"AGKFPSLLTHNENMVAK", - None, - None, - None, - Some(0.3343900), - None, - None, - ), - PeptideData::new( - "LSELDDRADALQAGASQFETSAAK", - None, - None, - None, - Some(0.5286755), - None, - None, - ), - PeptideData::new("FLLQDTVELR", None, None, None, Some(0.6522490), None, None), - PeptideData::new( - "SVTEQGAELSNEER", + Some(i as f32 / 10.0), None, None, None, - Some(0.2388270), - None, - None, - ), - PeptideData::new( - "EHALLAYTLGVK", - None, - None, - None, - Some(0.5360210), - None, - None, - ), - PeptideData::new( - "TVQSLEIDLDSM[+15.9949]R", - None, - None, - None, - Some(0.5787880), - None, - None, - ), - PeptideData::new( - "VVSQYSSLLSPMSVNAVM[+15.9949]K", - None, - None, - None, - Some(0.6726230), - None, - None, - ), - PeptideData::new( - "TFLALINQVFPAEEDSKK", - None, - None, - None, - Some(0.8345350), - None, - None, - ), - ]; + ) + }) + .collect(); - // Test prediction data let test_peptides_data = vec![ ("QPYAVSELAGHQTSAESWGTGR", "", "", 0.4328955), ("GMSVSDLADKLSTDDLNSLIAHAHR", "Oxidation@M", "1", 0.6536107), - ( - "TVQHHVLFTDNMVLICR", - "Oxidation@M;Carbamidomethyl@C", - "11;15", - 0.7811949, - ), + ("TVQHHVLFTDNMVLICR", "Oxidation@M;Carbamidomethyl@C", "11;15", 0.7811949), ("EAELDVNEELDKK", "", "", 0.2934583), ("YTPVQQGPVGVNVTYGGDPIPK", "", "", 0.5863009), - ("YYAIDFTLDEIK", "", "", 0.8048359), - ("VSSLQAEPLPR", "", "", 0.3201348), - ( - "NHAVVCQGCHNAIDPEVQR", - "Carbamidomethyl@C;Carbamidomethyl@C", - "5;8", - 0.1730425, - ), - ("IPNIYAIGDVVAGPMLAHK", "", "", 0.8220097), - ("AELGIPLEEVPPEEINYLTR", "", "", 0.8956433), - ("NESTPPSEELELDKWK", "", "", 0.4471560), - ("SIQEIQELDKDDESLR", "", "", 0.4157068), - ("EMEENFAVEAANYQDTIGR", "Oxidation@M", "1", 0.6388353), - ("MDSFDEDLARPSGLLAQER", "Oxidation@M", "0", 0.5593624), - ("SLLTEADAGHTEFTDEVYQNESR", "", "", 0.5538696), - ("NQDLAPNSAEQASILSLVTK", "", "", 0.7682227), - ("GKVEEVELPVEK", "", "", 0.2943246), - ("IYVASVHQDLSDDDIK", "", "", 0.3847130), - ("IKGDMDISVPK", "", "", 0.2844255), - ("IIPVLLEHGLER", "", "", 0.5619017), - ("AGYTDKVVIGMDVAASEFFR", "", "", 0.8972052), - ("TDYNASVSVPDSSGPER", "", "", 0.3279318), - ("DLKPQNLLINTEGAIK", "", "", 0.6046495), - ("VAEAIAASFGSFADFK", "", "", 0.8935943), - ("AMVSNAQLDNEK", "Oxidation@M", "1", 0.1724159), - ("THINIVVIGHVDSGK", "", "", 0.4865058), - ("LILPHVDIQLK", "", "", 0.6268850), - ("LIAPVAEEEATVPNNK", "", "", 0.4162872), - ("FTASAGIQVVGDDLTVTNPK", "", "", 0.7251064), - ("HEDLKDMLEFPAQELR", "", "", 0.6529368), - ("LLPDFLLER", "", "", 0.7852863), ]; let prediction_context = PredictionContext::new(&test_peptides_data); run_prediction(&mut model, &prediction_context)?; - // Fine-tune the model - let modifications = load_modifications().context("Failed to load modifications")?; - let learning_rate = 0.001; - let epochs = 5; - model - .fine_tune(&training_data, modifications, 10, learning_rate, epochs) - .context("Failed to fine-tune the model")?; + model.fine_tune(&training_data, modifications, 10, 0.001, 5)?; - // Test prediction again with a few peptides after fine-tuning run_prediction(&mut model, &prediction_context)?; model.save("alphapeptdeep_rt_cnn_lstm_finetuned.safetensors")?; diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs index fa8489a..58b08cb 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs @@ -335,9 +335,9 @@ mod tests { let device = Device::Cpu; let model = CCSCNNLSTMModel::new(model_path, 
Some(constants_path), 0, 8, 4, true, device).unwrap();

-        let peptide_sequences = vec!["AGHCEWQMKYR".to_string(), "AGHCEWQMKYR".to_string()];
-        let mods = vec!["Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string()];
-        let mod_sites = vec!["0;4;8".to_string(), "0;4;8".to_string()];
+        let peptide_sequences = vec!["AGHCEWQMKYR", "AGHCEWQMKYR"];
+        let mods = vec!["Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"];
+        let mod_sites = vec!["0;4;8", "0;4;8"];
         let charge = Some(vec![2, 2]);

         let result = model.predict(&peptide_sequences, &mods, &mod_sites, charge, None, None);
diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index 43b5c87..a9f667c 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -42,7 +42,7 @@ impl CCSModelWrapper {
         Ok(Self { model })
     }

-    pub fn predict(&self, peptide_sequence: &[String], mods: &[String], mod_sites: &[String], charge: Vec<i32>) -> Result<PredictionResult> {
+    pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>, charge: Vec<i32>) -> Result<PredictionResult> {
         self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), None, None)
     }

diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index 5f41e76..8d67d51 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -2,7 +2,7 @@ use crate::{
     building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed},
     models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper},
     utils::{
-        data_handling::{PeptideData, RTNormalization}, logging::Progress, peptdeep_utils::{
+        data_handling::{PeptideBatchData, PeptideData, RTNormalization}, logging::Progress, peptdeep_utils::{
            get_modification_indices, get_modification_string, parse_instrument_index,
            remove_mass_shift,
        }, stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, utils::{get_tensor_stats, CosineWithWarmup, LRScheduler}
@@ -242,16 +242,16 @@ pub trait ModelInterface: Send + Sync + ModelClone {
    /// A vector of predicted retention times.
    fn predict(
        &self,
-        peptide_sequences: &[String],
-        mods: &[String],
-        mod_sites: &[String],
+        peptide_sequences: &Vec<&str>,
+        mods: &Vec<&str>,
+        mod_sites: &Vec<&str>,
        charge: Option<Vec<i32>>,
        nce: Option<Vec<i32>>,
-        instrument: Option<Vec<String>>,
+        instrument: Option<&Vec<&str>>,
    ) -> Result<PredictionResult> {
        // Encode the batch of peptides
        let input_tensor = self
-            .encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument)?
+            .encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument.cloned())?
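+            // .cloned() turns the borrowed Option<&Vec<&str>> into the owned
+            // Option<Vec<&str>> that encode_peptides takes; only the &str
+            // references are copied, not the underlying strings.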
            .to_device(self.get_device())?;

        // Forward pass through the model
@@ -294,7 +294,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
        let mod_to_feature = self.get_mod_to_feature();

        log::trace!("[ModelInterface::encode_peptide] peptide_sequence: {} | mods: {} | mod_sites: {} | charge: {:?} | nce: {:?} | instrument: {:?}", peptide_sequence, mods, mod_sites, charge, nce, instrument);
-        
+
        let aa_tensor = aa_indices_tensor(peptide_sequence, device)?;
        let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?;

@@ -367,12 +367,12 @@ pub trait ModelInterface: Send + Sync + ModelClone {
    /// Encode a batch of peptide sequences into a tensor
    fn encode_peptides(
        &self,
-        peptide_sequences: &[String],
-        mods: &[String],
-        mod_sites: &[String],
+        peptide_sequences: &Vec<&str>,
+        mods: &Vec<&str>,
+        mod_sites: &Vec<&str>,
        charges: Option<Vec<i32>>,
        nces: Option<Vec<i32>>,
-        instruments: Option<Vec<String>>,
+        instruments: Option<Vec<&str>>,
    ) -> Result<Tensor> {
        let len = peptide_sequences.len();

@@ -385,7 +385,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                    &mod_sites[i],
                    charges.as_ref().map(|v| v[i]),
                    nces.as_ref().map(|v| v[i]),
-                    instruments.as_ref().map(|v| v[i].as_str()),
+                    instruments.as_ref().map(|v| v[i]),
                )
            })
            .collect::<Result<Vec<_>>>()?;

@@ -715,166 +715,167 @@ pub trait ModelInterface: Send + Sync + ModelClone {
        learning_rate: f64,
        epochs: usize,
    ) -> Result<()> {
-        let num_batches = if training_data.len() < batch_size {
-            1
-        } else {
-            let full_batches = training_data.len() / batch_size;
-            if training_data.len() % batch_size > 0 {
-                full_batches + 1
-            } else {
-                full_batches
-            }
-        };
-
-        info!(
-            "Fine-tuning {} model on {} peptide features ({} batches) for {} epochs",
-            self.get_model_arch(),
-            training_data.len(),
-            num_batches,
-            epochs
-        );
-
-        let params = candle_nn::ParamsAdamW {
-            lr: learning_rate,
-            ..Default::default()
-        };
-        let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?;
-
-        for epoch in 0..epochs {
-            let progress = Progress::new(num_batches, &format!("[fine-tuning] Epoch {}: ", epoch));
-            let mut total_loss = 0.0;
-
-            for batch_idx in 0..num_batches {
-                let start = batch_idx * batch_size;
-                let end = (start + batch_size).min(training_data.len());
-                let batch_data = &training_data[start..end];
-
-                let peptides: Vec<String> = batch_data
-                    .iter()
-                    .map(|p| remove_mass_shift(&p.sequence))
-                    .collect();
-                let mods: Vec<String> = batch_data
-                    .iter()
-                    .map(|p| get_modification_string(&p.sequence, &modifications))
-                    .collect();
-                let mod_sites: Vec<String> = batch_data
-                    .iter()
-                    .map(|p| get_modification_indices(&p.sequence))
-                    .collect();
-
-                let charges = batch_data
-                    .iter()
-                    .filter_map(|p| p.charge)
-                    .collect::<Vec<i32>>();
-                let charges = if charges.len() == batch_data.len() {
-                    Some(charges)
-                } else {
-                    None
-                };
-
-                let nces = batch_data.iter().filter_map(|p| p.nce).collect::<Vec<i32>>();
-                let nces = if nces.len() == batch_data.len() {
-                    Some(nces)
-                } else {
-                    None
-                };
-
-                let instruments = batch_data
-                    .iter()
-                    .filter_map(|p| p.instrument.clone())
-                    .collect::<Vec<String>>();
-                let instruments = if instruments.len() == batch_data.len() {
-                    Some(instruments)
-                } else {
-                    None
-                };
-
-                let input_batch = self
-                    .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?
- .to_device(self.get_device())?; - - log::trace!( - "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", - input_batch.shape(), - input_batch.device() - ); - - let batch_targets = match self.property_type() { - PropertyType::RT => PredictionResult::RTResult( - batch_data - .iter() - .map(|p| p.retention_time.unwrap_or_default()) - .collect(), - ), - PropertyType::CCS => PredictionResult::CCSResult( - batch_data - .iter() - .map(|p| p.ion_mobility.unwrap_or_default()) - .collect(), - ), - PropertyType::MS2 => PredictionResult::MS2Result( - batch_data - .iter() - .map(|p| p.ms2_intensities.clone().unwrap_or_default()) - .collect(), - ), - }; - - let target_batch = match batch_targets { - PredictionResult::RTResult(ref values) - | PredictionResult::CCSResult(ref values) => { - Tensor::new(values.clone(), &self.get_device())? - } - PredictionResult::MS2Result(ref spectra) => { - let max_len = spectra.iter().map(|s| s.len()).max().unwrap_or(1); - let feature_dim = spectra - .get(0) - .and_then(|s| s.get(0)) - .map(|v| v.len()) - .unwrap_or(1); - let mut padded_spectra = spectra.clone(); - for s in &mut padded_spectra { - s.resize(max_len, vec![0.0; feature_dim]); - } - Tensor::new(padded_spectra.concat(), &self.get_device())?.reshape(( - batch_data.len(), - max_len, - feature_dim, - ))? - } - } - .to_device(self.get_device())?; - - let predicted = self.forward(&input_batch)?; - let loss = candle_nn::loss::mse(&predicted, &target_batch)?; - opt.backward_step(&loss)?; - - total_loss += loss.to_vec0::().unwrap_or(990.0); - - progress.update_description(&format!( - "[fine-tuning] Epoch {}: Loss: {}", - epoch, - loss.to_vec0::()? - )); - progress.inc(); - } - - let avg_loss = total_loss / num_batches as f32; - progress.update_description(&format!( - "[fine-tuning] Epoch {}: Avg. 
Batch Loss: {}", - epoch, avg_loss - )); - progress.finish(); - } - - Ok(()) + // let num_batches = if training_data.len() < batch_size { + // 1 + // } else { + // let full_batches = training_data.len() / batch_size; + // if training_data.len() % batch_size > 0 { + // full_batches + 1 + // } else { + // full_batches + // } + // }; + + // info!( + // "Fine-tuning {} model on {} peptide features ({} batches) for {} epochs", + // self.get_model_arch(), + // training_data.len(), + // num_batches, + // epochs + // ); + + // let params = candle_nn::ParamsAdamW { + // lr: learning_rate, + // ..Default::default() + // }; + // let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; + + // for epoch in 0..epochs { + // let progress = Progress::new(num_batches, &format!("[fine-tuning] Epoch {}: ", epoch)); + // let mut total_loss = 0.0; + + // for batch_idx in 0..num_batches { + // let start = batch_idx * batch_size; + // let end = (start + batch_size).min(training_data.len()); + // let batch_data = &training_data[start..end]; + + // let peptides: Vec = batch_data + // .iter() + // .map(|p| remove_mass_shift(&p.sequence)) + // .collect(); + // let mods: Vec = batch_data + // .iter() + // .map(|p| get_modification_string(&p.sequence, &modifications)) + // .collect(); + // let mod_sites: Vec = batch_data + // .iter() + // .map(|p| get_modification_indices(&p.sequence)) + // .collect(); + + // let charges = batch_data + // .iter() + // .filter_map(|p| p.charge) + // .collect::>(); + // let charges = if charges.len() == batch_data.len() { + // Some(charges) + // } else { + // None + // }; + + // let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); + // let nces = if nces.len() == batch_data.len() { + // Some(nces) + // } else { + // None + // }; + + // let instruments = batch_data + // .iter() + // .filter_map(|p| p.instrument.clone()) + // .collect::>(); + // let instruments = if instruments.len() == batch_data.len() { + // Some(instruments) + // } else { + // None + // }; + + // let input_batch = self + // .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)? + // .to_device(self.get_device())?; + + // log::trace!( + // "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", + // input_batch.shape(), + // input_batch.device() + // ); + + // let batch_targets = match self.property_type() { + // PropertyType::RT => PredictionResult::RTResult( + // batch_data + // .iter() + // .map(|p| p.retention_time.unwrap_or_default()) + // .collect(), + // ), + // PropertyType::CCS => PredictionResult::CCSResult( + // batch_data + // .iter() + // .map(|p| p.ion_mobility.unwrap_or_default()) + // .collect(), + // ), + // PropertyType::MS2 => PredictionResult::MS2Result( + // batch_data + // .iter() + // .map(|p| p.ms2_intensities.clone().unwrap_or_default()) + // .collect(), + // ), + // }; + + // let target_batch = match batch_targets { + // PredictionResult::RTResult(ref values) + // | PredictionResult::CCSResult(ref values) => { + // Tensor::new(values.clone(), &self.get_device())? 
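+                // (In this disabled body, RT/CCS targets become a flat tensor
+                //  here, while the MS2 arm below pads each spectrum to the
+                //  batch's max fragment count before reshaping.)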
+ // } + // PredictionResult::MS2Result(ref spectra) => { + // let max_len = spectra.iter().map(|s| s.len()).max().unwrap_or(1); + // let feature_dim = spectra + // .get(0) + // .and_then(|s| s.get(0)) + // .map(|v| v.len()) + // .unwrap_or(1); + // let mut padded_spectra = spectra.clone(); + // for s in &mut padded_spectra { + // s.resize(max_len, vec![0.0; feature_dim]); + // } + // Tensor::new(padded_spectra.concat(), &self.get_device())?.reshape(( + // batch_data.len(), + // max_len, + // feature_dim, + // ))? + // } + // } + // .to_device(self.get_device())?; + + // let predicted = self.forward(&input_batch)?; + // let loss = candle_nn::loss::mse(&predicted, &target_batch)?; + // opt.backward_step(&loss)?; + + // total_loss += loss.to_vec0::().unwrap_or(990.0); + + // progress.update_description(&format!( + // "[fine-tuning] Epoch {}: Loss: {}", + // epoch, + // loss.to_vec0::()? + // )); + // progress.inc(); + // } + + // let avg_loss = total_loss / num_batches as f32; + // progress.update_description(&format!( + // "[fine-tuning] Epoch {}: Avg. Batch Loss: {}", + // epoch, avg_loss + // )); + // progress.finish(); + // } + + // Ok(()) + todo!() } fn inference( &self, inference_data: &Vec, batch_size: usize, - modifications: HashMap< + _modifications: HashMap< (String, Option), crate::utils::peptdeep_utils::ModificationMap, >, @@ -895,49 +896,33 @@ pub trait ModelInterface: Send + Sync + ModelClone { .enumerate() .map(|(batch_idx, batch_data)| { let start_idx = batch_idx * batch_size; + let batch: PeptideBatchData = batch_data.into(); - let peptides: Vec = batch_data - .iter() - .map(|p| remove_mass_shift(&p.sequence)) - .collect(); - let mods: Vec = batch_data - .iter() - .map(|p| get_modification_string(&p.sequence, &modifications)) - .collect(); - let mod_sites: Vec = batch_data - .iter() - .map(|p| get_modification_indices(&p.sequence)) - .collect(); + let naked_sequences = batch.naked_sequence_strs(); + let mods = batch.mods_strs(); + let mod_sites = batch.mod_sites_strs(); - let charges = batch_data - .iter() - .filter_map(|p| p.charge) - .collect::>(); - let charges = if charges.len() == batch_data.len() { - Some(charges) + let charges = if batch.charges.iter().all(|c| c.is_some()) { + Some(batch.charges.iter().map(|c| c.unwrap()).collect::>()) } else { None }; - let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { - Some(nces) + let nces = if batch.nces.iter().all(|n| n.is_some()) { + Some(batch.nces.iter().map(|n| n.unwrap()).collect::>()) } else { None }; - let instruments = batch_data - .iter() - .filter_map(|p| p.instrument.clone()) - .collect::>(); - let instruments = if instruments.len() == batch_data.len() { - Some(instruments) + let instruments = if batch.instruments.iter().all(|i| i.is_some()) { + let flat: Vec<&str> = batch.instrument_strs().into_iter().map(|opt| opt.unwrap()).collect(); + Some(flat) } else { None }; let input_tensor = self - .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)? + .encode_peptides(&naked_sequences, &mods, &mod_sites, charges, nces, instruments)? .to_device(self.get_device())?; let output = self.forward(&input_tensor)?; @@ -981,34 +966,66 @@ pub trait ModelInterface: Send + Sync + ModelClone { progress.finish(); Ok(result.into_iter().flatten().collect()) } + /// Extract encoded input and target tensor for a batch of peptides. 
fn prepare_batch_inputs( &self, batch_data: &[PeptideData], - modifications: &HashMap<(String, Option), crate::utils::peptdeep_utils::ModificationMap>, + _modifications: &HashMap<(String, Option), crate::utils::peptdeep_utils::ModificationMap>, ) -> Result<(Tensor, Tensor)> { - let peptides: Vec = batch_data.par_iter().map(|p| remove_mass_shift(&p.sequence)).collect(); + use rayon::prelude::*; - let mods: Vec = batch_data.par_iter().map(|p| get_modification_string(&p.sequence, modifications)).collect(); + let batch: PeptideBatchData = batch_data.into(); - let mod_sites: Vec = batch_data.par_iter().map(|p| get_modification_indices(&p.sequence)).collect(); + let naked_sequences = batch.naked_sequence_strs(); - let charges = batch_data.par_iter().filter_map(|p| p.charge).collect::>(); - let charges = if charges.len() == batch_data.len() { Some(charges) } else { None }; + let mods = batch.mods_strs(); - let nces = batch_data.par_iter().filter_map(|p| p.nce).collect::>(); - let nces = if nces.len() == batch_data.len() { Some(nces) } else { None }; + let mod_sites = batch.mod_sites_strs(); - let instruments = batch_data.par_iter().filter_map(|p| p.instrument.clone()).collect::>(); - let instruments = if instruments.len() == batch_data.len() { Some(instruments) } else { None }; + let charges = if batch.charges.iter().all(|c| c.is_some()) { + Some(batch.charges.iter().map(|c| c.unwrap()).collect::>()) + } else { + None + }; - let input_batch = self.encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)?.to_device(self.get_device())?; + let nces = if batch.nces.iter().all(|n| n.is_some()) { + Some(batch.nces.iter().map(|n| n.unwrap()).collect::>()) + } else { + None + }; + + let instruments = if batch.instruments.iter().all(|i| i.is_some()) { + let flat: Vec<&str> = batch + .instrument_strs() + .into_iter() + .map(|opt| opt.unwrap()) + .collect(); + Some(flat) + } else { + None + }; + + + let input_batch = self + .encode_peptides(&naked_sequences, &mods, &mod_sites, charges, nces, instruments)? 
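+            // Aside, a sketch: Option's FromIterator short-circuits on the
+            // first None, so each "all present or none" block above can be
+            // collapsed into a single pass:
+            //     let charges: Option<Vec<i32>> = batch.charges.iter().copied().collect();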
+ .to_device(self.get_device())?; let target_values: Vec = match self.property_type() { - PropertyType::RT => batch_data.par_iter().map(|p| p.retention_time.unwrap_or_default()).collect(), - PropertyType::CCS => batch_data.par_iter().map(|p| p.ion_mobility.unwrap_or_default()).collect(), - PropertyType::MS2 => return Err(anyhow::anyhow!("MS2 training is not yet implemented")), + PropertyType::RT => batch + .retention_times + .iter() + .map(|v| v.unwrap_or(0.0)) + .collect(), + PropertyType::CCS => batch + .ion_mobilities + .iter() + .map(|v| v.unwrap_or(0.0)) + .collect(), + PropertyType::MS2 => { + return Err(anyhow::anyhow!("MS2 training is not yet implemented")) + } }; let target_tensor = Tensor::new(target_values, &self.get_device())?; diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 32a634b..86ac41c 100644 --- a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -499,15 +499,15 @@ mod tests { let device = Device::Cpu; let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); - let peptide_sequences = vec!["AGHCEWQMKYR".to_string(), "AGHCEWQMKYR".to_string()]; + let peptide_sequences = vec!["AGHCEWQMKYR", "AGHCEWQMKYR"]; let mods = vec![ - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", ]; - let mod_sites = vec!["0;4;8".to_string(), "0;4;8".to_string()]; + let mod_sites = vec!["0;4;8", "0;4;8"]; let charge = Some(vec![2, 2]); let nce = Some(vec![20, 20]); - let instrument = Some(vec!["QE".to_string(), "QE".to_string()]); + let instrument = Some(vec!["QE", "QE"]); let input_tensor = model .encode_peptides( diff --git a/crates/redeem-properties/src/models/ms2_model.rs b/crates/redeem-properties/src/models/ms2_model.rs index ea3c489..176b7e7 100644 --- a/crates/redeem-properties/src/models/ms2_model.rs +++ b/crates/redeem-properties/src/models/ms2_model.rs @@ -40,7 +40,7 @@ impl MS2ModelWrapper { Ok(Self { model }) } - pub fn predict(&self, peptide_sequence: &[String], mods: &[String], mod_sites: &[String], charge: Vec, nce: Vec, intsrument: Vec) -> Result { + pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>, charge: Vec, nce: Vec, intsrument: &Vec<&str>) -> Result { self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), Some(nce), Some(intsrument)) } diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs index 19e3f21..a0f3c84 100644 --- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs @@ -349,16 +349,16 @@ mod tests { // Batched input let peptide_sequences = vec![ - "ACDEFGHIK".to_string(), - "AGHCEWQMKYR".to_string(), + "ACDEFGHIK", + "AGHCEWQMKYR", ]; let mods = vec![ - "Carbamidomethyl@C".to_string(), - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_string(), + "Carbamidomethyl@C", + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", ]; let mod_sites = vec![ - "1".to_string(), - "0;4;8".to_string(), + "1", + "0;4;8", ]; println!("Peptides: {:?}", peptide_sequences); @@ -388,34 +388,23 @@ mod tests { #[test] fn test_prediction() { + let model_path = 
PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); - let constants_path = - PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); - let device = /* Assuming Device is defined */ Device::new_cuda(0).unwrap_or(/* assuming Device::Cpu is defined */ Device::Cpu); // Replace with actual Device code. - let result = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device); + let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); + let device = Device::new_cuda(0).unwrap_or(Device::Cpu); + let result = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device); let mut model = result.unwrap(); - - // Test prediction with a few peptides after fine-tuning + let test_peptides = vec![ ("AGHCEWQMKYR", "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", "0;4;8", 0.2945), ("QPYAVSELAGHQTSAESWGTGR", "", "", 0.4328955), ("GMSVSDLADKLSTDDLNSLIAHAHR", "Oxidation@M", "1", 0.6536107), - ( - "TVQHHVLFTDNMVLICR", - "Oxidation@M;Carbamidomethyl@C", - "11;15", - 0.7811949, - ), + ("TVQHHVLFTDNMVLICR", "Oxidation@M;Carbamidomethyl@C", "11;15", 0.7811949), ("EAELDVNEELDKK", "", "", 0.2934583), ("YTPVQQGPVGVNVTYGGDPIPK", "", "", 0.5863009), ("YYAIDFTLDEIK", "", "", 0.8048359), ("VSSLQAEPLPR", "", "", 0.3201348), - ( - "NHAVVCQGCHNAIDPEVQR", - "Carbamidomethyl@C;Carbamidomethyl@C", - "5;8", - 0.1730425, - ), + ("NHAVVCQGCHNAIDPEVQR", "Carbamidomethyl@C;Carbamidomethyl@C", "5;8", 0.1730425), ("IPNIYAIGDVVAGPMLAHK", "", "", 0.8220097), ("AELGIPLEEVPPEEINYLTR", "", "", 0.8956433), ("NESTPPSEELELDKWK", "", "", 0.4471560), @@ -440,34 +429,32 @@ mod tests { ("HEDLKDMLEFPAQELR", "", "", 0.6529368), ("LLPDFLLER", "", "", 0.7852863), ]; - - let batch_size = 16; // Set an appropriate batch size - let peptides: Vec = test_peptides.iter().map(|(pep, _, _, _)| pep.to_string()).collect(); - let mods: Vec = test_peptides.iter().map(|(_, mod_, _, _)| mod_.to_string()).collect(); - let mod_sites: Vec = test_peptides.iter().map(|(_, _, sites, _)| sites.to_string()).collect(); + + let peptides: Vec<&str> = test_peptides.iter().map(|(pep, _, _, _)| *pep).collect(); + let mods: Vec<&str> = test_peptides.iter().map(|(_, mod_, _, _)| *mod_).collect(); + let mod_sites: Vec<&str> = test_peptides.iter().map(|(_, _, sites, _)| *sites).collect(); let observed_rts: Vec = test_peptides.iter().map(|(_, _, _, rt)| *rt).collect(); - + match model.predict(&peptides, &mods, &mod_sites, None, None, None) { Ok(predictions) => { - if let /* Assuming PredictionResult and RTResult are defined */ PredictionResult::RTResult(rt_preds) = predictions { // Replace with actual PredictionResult and RTResult code + if let PredictionResult::RTResult(rt_preds) = predictions { let total_error: f32 = rt_preds.iter().zip(observed_rts.iter()) .map(|(pred, obs)| (pred - obs).abs()) .sum(); - - // PRINT PREDICTIONS AND OBSERVED RTs WITHOUT IZIP + let mut peptides_iter = peptides.iter(); let mut rt_preds_iter = rt_preds.iter(); let mut observed_rts_iter = observed_rts.iter(); - + loop { match (peptides_iter.next(), rt_preds_iter.next(), observed_rts_iter.next()) { (Some(pep), Some(pred), Some(obs)) => { println!("Peptide: {}, Predicted RT: {}, Observed RT: {}", pep, pred, obs); } - _ => break, // Exit the loop if any iterator is exhausted + _ => break, } } - + let mean_absolute_error = total_error / rt_preds.len() as f32; println!("Mean Absolute Error: {:.6}", mean_absolute_error); } else { diff --git a/crates/redeem-properties/src/models/rt_model.rs 
b/crates/redeem-properties/src/models/rt_model.rs
index adab38c..c41f056 100644
--- a/crates/redeem-properties/src/models/rt_model.rs
+++ b/crates/redeem-properties/src/models/rt_model.rs
@@ -47,7 +47,7 @@ impl RTModelWrapper {
        Ok(Self { model })
    }

-    pub fn predict(&self, peptide_sequence: &[String], mods: &[String], mod_sites: &[String]) -> Result<PredictionResult> {
+    pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>) -> Result<PredictionResult> {
        self.model.predict(peptide_sequence, mods, mod_sites, None, None, None)
    }

diff --git a/crates/redeem-properties/src/utils/data_handling.rs b/crates/redeem-properties/src/utils/data_handling.rs
index 2722122..035238d 100644
--- a/crates/redeem-properties/src/utils/data_handling.rs
+++ b/crates/redeem-properties/src/utils/data_handling.rs
@@ -1,4 +1,5 @@

+use std::sync::Arc;

 /// Type of RT normalization used
 #[derive(Debug, Clone, Copy)]
@@ -18,13 +19,17 @@ impl RTNormalization {
    }
 }

 #[derive(Clone)]
 pub struct PeptideData {
-    pub sequence: String,
+    pub modified_sequence: Arc<[u8]>, // e.g., "(UniMod:1)M(UniMod:35)AAAATMAAAAR"
+    pub naked_sequence: Arc<[u8]>,    // e.g., "MAAAATMAAAAR"
+    pub mods: Arc<[u8]>,              // e.g., "Any_N-term;Oxidation@M"
+    pub mod_sites: Arc<[u8]>,         // e.g., "0;1"
    pub charge: Option<i32>,
    pub precursor_mass: Option<f32>,
    pub nce: Option<i32>,
-    pub instrument: Option<String>,
+    pub instrument: Option<Arc<[u8]>>,
    pub retention_time: Option<f32>,
    pub ion_mobility: Option<f32>,
    pub ccs: Option<f32>,
@@ -32,17 +37,118 @@ pub struct PeptideData {
 }

 impl PeptideData {
-    pub fn new(sequence: &str, charge: Option<i32>, precursor_mass: Option<f32>, nce: Option<i32>, instrument: Option<&str>, retention_time: Option<f32>, ion_mobility: Option<f32>, ccs: Option<f32>, ms2_intensities: Option<Vec<Vec<f32>>>) -> Self {
+    pub fn new(
+        modified_sequence: &str,
+        naked_sequence: &str,
+        mods: &str,
+        mod_sites: &str,
+        charge: Option<i32>,
+        precursor_mass: Option<f32>,
+        nce: Option<i32>,
+        instrument: Option<&str>,
+        retention_time: Option<f32>,
+        ion_mobility: Option<f32>,
+        ccs: Option<f32>,
+        ms2_intensities: Option<Vec<Vec<f32>>>,
+    ) -> Self {
        Self {
-            sequence: sequence.to_string(),
+            modified_sequence: Arc::from(modified_sequence.as_bytes().to_vec().into_boxed_slice()),
+            naked_sequence: Arc::from(naked_sequence.as_bytes().to_vec().into_boxed_slice()),
+            mods: Arc::from(mods.as_bytes().to_vec().into_boxed_slice()),
+            mod_sites: Arc::from(mod_sites.as_bytes().to_vec().into_boxed_slice()),
            charge,
            precursor_mass,
            nce,
-            instrument: instrument.map(|s| s.to_string()),
+            instrument: instrument.map(|s| Arc::from(s.as_bytes().to_vec().into_boxed_slice())),
            retention_time,
            ion_mobility,
            ccs,
-            ms2_intensities
+            ms2_intensities,
        }
    }
+
+    pub fn modified_sequence_str(&self) -> &str {
+        std::str::from_utf8(&self.modified_sequence).unwrap_or("")
+    }
+
+    pub fn naked_sequence_str(&self) -> &str {
+        std::str::from_utf8(&self.naked_sequence).unwrap_or("")
+    }
+
+    pub fn mods_str(&self) -> &str {
+        std::str::from_utf8(&self.mods).unwrap_or("")
+    }
+
+    pub fn mod_sites_str(&self) -> &str {
+        std::str::from_utf8(&self.mod_sites).unwrap_or("")
+    }
+
+    pub fn instrument_str(&self) -> Option<&str> {
+        self.instrument
+            .as_ref()
+            .map(|v| std::str::from_utf8(v).unwrap_or(""))
+    }
+}
+
+pub struct PeptideBatchData {
+    pub naked_sequence: Vec<Arc<[u8]>>,
+    pub mods: Vec<Arc<[u8]>>,
+    pub mod_sites: Vec<Arc<[u8]>>,
+    pub charges: Vec<Option<i32>>,
+    pub precursor_masses: Vec<Option<f32>>,
+    pub nces: Vec<Option<i32>>,
+    pub instruments: Vec<Option<Arc<[u8]>>>,
+    pub retention_times: Vec<Option<f32>>,
+    pub ion_mobilities: Vec<Option<f32>>,
+    pub ccs: Vec<Option<f32>>,
+    pub ms2_intensities: Vec<Option<Vec<Vec<f32>>>>,
+}
+
+impl From<&[PeptideData]> for PeptideBatchData {
+    fn from(slice: &[PeptideData]) -> Self {
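+        // Struct-of-arrays view of the batch: each Arc::clone below only
+        // bumps a reference count, so building a batch copies pointers rather
+        // than the underlying sequence and modification strings.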
Self { + naked_sequence: slice.iter().map(|p| Arc::clone(&p.naked_sequence)).collect(), + mods: slice.iter().map(|p| Arc::clone(&p.mods)).collect(), + mod_sites: slice.iter().map(|p| Arc::clone(&p.mod_sites)).collect(), + charges: slice.iter().map(|p| p.charge).collect(), + precursor_masses: slice.iter().map(|p| p.precursor_mass).collect(), + nces: slice.iter().map(|p| p.nce).collect(), + instruments: slice.iter().map(|p| p.instrument.clone()).collect(), + retention_times: slice.iter().map(|p| p.retention_time).collect(), + ion_mobilities: slice.iter().map(|p| p.ion_mobility).collect(), + ccs: slice.iter().map(|p| p.ccs).collect(), + ms2_intensities: slice.iter().map(|p| p.ms2_intensities.clone()).collect(), } } } + + +impl PeptideBatchData { + pub fn naked_sequence_strs(&self) -> Vec<&str> { + self.naked_sequence + .iter() + .map(|s| std::str::from_utf8(s).unwrap_or("")) + .collect() + } + + pub fn mods_strs(&self) -> Vec<&str> { + self.mods + .iter() + .map(|s| std::str::from_utf8(s).unwrap_or("")) + .collect() + } + + pub fn mod_sites_strs(&self) -> Vec<&str> { + self.mod_sites + .iter() + .map(|s| std::str::from_utf8(s).unwrap_or("")) + .collect() + } + + pub fn instrument_strs(&self) -> Vec> { + self.instruments + .iter() + .map(|opt| opt.as_ref().map(|s| std::str::from_utf8(s).unwrap_or(""))) + .collect() + } +} From 67de1dd04724763d42b770956d224e1a6db3fb5d Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 13 May 2025 13:04:45 -0400 Subject: [PATCH 46/75] refactor: Update mod_to_feature loading to use Arc for key in RTCNNTFModel and CCSCNNTFModel --- .../examples/alphapeptdeep_ccs_cnn_lstm.rs | 50 +-- .../examples/alphapeptdeep_ms2_bert.rs | 43 ++- .../examples/alphapeptdeep_rt_cnn_lstm.rs | 142 +++++--- .../src/building_blocks/featurize.rs | 111 +++++- .../src/models/ccs_cnn_lstm_model.rs | 148 +++++--- .../src/models/ccs_cnn_tf_model.rs | 28 +- .../redeem-properties/src/models/ccs_model.rs | 108 ++++-- .../src/models/model_interface.rs | 212 ++++++----- .../src/models/ms2_bert_model.rs | 72 ++-- .../redeem-properties/src/models/ms2_model.rs | 83 ++++- .../src/models/rt_cnn_lstm_model.rs | 341 ++++++++++-------- .../src/models/rt_cnn_transformer_model.rs | 11 +- .../redeem-properties/src/models/rt_model.rs | 123 +++++-- .../src/utils/peptdeep_utils.rs | 28 ++ 14 files changed, 981 insertions(+), 519 deletions(-) diff --git a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs index 0236f0a..085cb0b 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ccs_cnn_lstm.rs @@ -6,25 +6,31 @@ use redeem_properties::{ model_interface::{ModelInterface, PredictionResult}, }, utils::{ - data_handling::{PeptideData}, - peptdeep_utils::{ion_mobility_to_ccs_bruker}, + data_handling::PeptideData, + peptdeep_utils::ion_mobility_to_ccs_bruker, }, }; use std::{path::PathBuf, sync::Arc}; -struct PredictionContext<'a> { - peptides: Vec<&'a str>, - mods: Vec<&'a str>, - mod_sites: Vec<&'a str>, +struct PredictionContext { + peptides: Vec>, + mods: Vec>, + mod_sites: Vec>, charges: Vec, observed_ccs: Vec, } -impl<'a> PredictionContext<'a> { - fn new(test_peptides: &'a [(&'a str, &'a str, &'a str, i32, f32)]) -> Self { - let peptides = test_peptides.iter().map(|(pep, _, _, _, _)| *pep).collect(); - let mods = test_peptides.iter().map(|(_, m, _, _, _)| *m).collect(); - let mod_sites = test_peptides.iter().map(|(_, _, s, _, _)| *s).collect(); +impl 
PredictionContext { + fn new(test_peptides: &[(&str, &str, &str, i32, f32)]) -> Self { + let peptides = test_peptides.iter() + .map(|(pep, _, _, _, _)| Arc::from(pep.as_bytes().to_vec().into_boxed_slice())) + .collect(); + let mods = test_peptides.iter() + .map(|(_, m, _, _, _)| Arc::from(m.as_bytes().to_vec().into_boxed_slice())) + .collect(); + let mod_sites = test_peptides.iter() + .map(|(_, _, s, _, _)| Arc::from(s.as_bytes().to_vec().into_boxed_slice())) + .collect(); let charges = test_peptides.iter().map(|(_, _, _, c, _)| *c).collect(); let observed_ccs = test_peptides.iter().map(|(_, _, _, _, ccs)| *ccs).collect(); @@ -48,18 +54,20 @@ fn run_prediction(model: &mut CCSCNNLSTMModel, ctx: &PredictionContext) -> Resul None, )? { PredictionResult::CCSResult(preds) => { - let total_error: f32 = preds - .iter() - .zip(ctx.observed_ccs.iter()) + let total_error: f32 = preds.iter().zip(ctx.observed_ccs.iter()) .map(|(pred, obs)| (pred - obs).abs()) .sum(); - for (pep, pred, obs) in itertools::izip!(&ctx.peptides, &preds, &ctx.observed_ccs) { - println!("Peptide: {}, Predicted CCS: {:.4}, Observed CCS: {:.4}", pep, pred, obs); + for ((pep, pred), obs) in ctx.peptides.iter().zip(preds.clone()).zip(&ctx.observed_ccs) { + println!( + "Peptide: {}, Predicted CCS: {:.4}, Observed CCS: {:.4}", + std::str::from_utf8(pep).unwrap_or(""), + pred, + obs + ); } - let mae = total_error / preds.len() as f32; - println!("Mean Absolute Error: {:.6}", mae); + println!("Mean Absolute Error: {:.6}", total_error / preds.len() as f32); } _ => println!("Unexpected prediction result type."), } @@ -71,8 +79,6 @@ fn main() -> Result<()> { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); let device = Device::new_cuda(0).unwrap_or(Device::Cpu); - println!("Device: {:?}", device); - let mut model = CCSCNNLSTMModel::new( &model_path, Some(&constants_path), @@ -97,7 +103,5 @@ fn main() -> Result<()> { ]; let ctx = PredictionContext::new(&test_peptides); - run_prediction(&mut model, &ctx)?; - - Ok(()) + run_prediction(&mut model, &ctx) } diff --git a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs index cf7b8f7..a1a7921 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_ms2_bert.rs @@ -18,28 +18,35 @@ use std::{ fn run_prediction(model: &mut MS2BertModel, batch_data: &[PeptideData]) -> Result<()> { let batch = PeptideBatchData::from(batch_data); - let peptides = batch.naked_sequence_strs(); - let mods = batch.mods_strs(); - let mod_sites = batch.mod_sites_strs(); - - let charges = if batch.charges.iter().all(|c| c.is_some()) { - Some(batch.charges.iter().map(|c| c.unwrap()).collect()) - } else { - None - }; - let nces = if batch.nces.iter().all(|n| n.is_some()) { - Some(batch.nces.iter().map(|n| n.unwrap()).collect()) - } else { - None - }; let instruments = if batch.instruments.iter().all(|i| i.is_some()) { - let flat: Vec<&str> = batch.instrument_strs().into_iter().map(|opt| opt.unwrap()).collect(); - Some(flat) + Some( + batch + .instruments + .iter() + .map(|opt| opt.as_ref().map(|a| Arc::clone(a))) + .collect::>(), + ) } else { None }; - - let predictions = model.predict(&peptides, &mods, &mod_sites, charges, nces, instruments.as_ref())?; + + + let predictions = model.predict( + &batch.naked_sequence, + &batch.mods, + &batch.mod_sites, + if batch.charges.iter().all(|c| c.is_some()) { + Some(batch.charges.iter().map(|c| 
c.unwrap()).collect()) + } else { + None + }, + if batch.nces.iter().all(|n| n.is_some()) { + Some(batch.nces.iter().map(|n| n.unwrap()).collect()) + } else { + None + }, + instruments, + )?; if let PredictionResult::MS2Result(ms2_preds) = predictions { let total_error: f32 = ms2_preds diff --git a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs index f206ad7..25026e2 100644 --- a/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs +++ b/crates/redeem-properties/examples/alphapeptdeep_rt_cnn_lstm.rs @@ -6,65 +6,68 @@ use redeem_properties::{ rt_cnn_lstm_model::RTCNNLSTMModel, }, utils::{ - data_handling::{PeptideData, PeptideBatchData}, - peptdeep_utils::{load_modifications, remove_mass_shift, get_modification_string, get_modification_indices}, + data_handling::PeptideData, + peptdeep_utils::{ + get_modification_indices, get_modification_string, load_modifications, + remove_mass_shift, + }, }, }; use std::path::PathBuf; -use std::sync::Arc; -struct PredictionContext { - peptides: Vec<&'static str>, - mods: Vec<&'static str>, - mod_sites: Vec<&'static str>, - observed_rts: Vec, -} - -impl PredictionContext { - fn new(test_peptides: &[(&'static str, &'static str, &'static str, f32)]) -> Self { - let peptides = test_peptides.iter().map(|(pep, _, _, _)| *pep).collect(); - let mods = test_peptides.iter().map(|(_, m, _, _)| *m).collect(); - let mod_sites = test_peptides.iter().map(|(_, _, site, _)| *site).collect(); - let observed_rts = test_peptides.iter().map(|(_, _, _, rt)| *rt).collect(); - - Self { - peptides, - mods, - mod_sites, - observed_rts, - } - } -} - -fn run_prediction(model: &mut RTCNNLSTMModel, context: &PredictionContext) -> Result<()> { - match model.predict( - &context.peptides, - &context.mods, - &context.mod_sites, +fn run_prediction(model: &mut RTCNNLSTMModel, batch_data: &[PeptideData]) -> Result<()> { + let batch = redeem_properties::utils::data_handling::PeptideBatchData::from(batch_data); + + let predictions = model.predict( + &batch.naked_sequence, + &batch.mods, + &batch.mod_sites, + if batch.charges.iter().all(|c| c.is_some()) { + Some(batch.charges.iter().map(|c| c.unwrap()).collect()) + } else { + None + }, None, - None, - None, - ) { - Ok(preds) => { - if let PredictionResult::RTResult(rt_preds) = preds { - let total_error: f32 = rt_preds + if batch.instruments.iter().all(|i| i.is_some()) { + Some( + batch + .instruments .iter() - .zip(&context.observed_rts) - .map(|(p, o)| (p - o).abs()) - .sum(); - - for ((pep, pred), obs) in context.peptides.iter().zip(rt_preds.iter()).zip(&context.observed_rts) { - println!("Peptide: {}, Predicted RT: {:.6}, Observed RT: {:.6}", pep, pred, obs); - } - - println!( - "Mean Absolute Error: {:.6}", - total_error / rt_preds.len() as f32 - ); - } + .map(|opt| opt.as_ref().map(|a| a.clone())) + .collect::>(), + ) + } else { + None + }, + )?; + + if let PredictionResult::RTResult(rt_preds) = predictions { + let total_error: f32 = rt_preds + .iter() + .zip(batch.retention_times.iter()) + .map(|(pred, obs)| (pred - obs.unwrap_or_default()).abs()) + .sum(); + + for ((seq, pred), obs) in batch + .naked_sequence + .iter() + .zip(rt_preds.iter()) + .zip(batch.retention_times.iter()) + { + println!( + "Peptide: {}, Predicted RT: {:.6}, Observed RT: {:.6}", + std::str::from_utf8(seq).unwrap_or(""), + pred, + obs.unwrap_or_default() + ); } - Err(e) => println!("Prediction error: {e}"), + + println!( + "Mean Absolute Error: {:.6}", + total_error / 
rt_preds.len() as f32 + ); } + Ok(()) } @@ -74,7 +77,6 @@ fn main() -> Result<()> { let model_path = PathBuf::from("/home/singjc/Documents/github/redeem/rt_fine_tuned.safetensors"); let constants_path = PathBuf::from("/home/singjc/Documents/github/redeem/crates/redeem-properties/data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::new_cuda(0).unwrap_or(Device::Cpu); - println!("Device: {:?}", device); let mut model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device) @@ -117,21 +119,45 @@ fn main() -> Result<()> { }) .collect(); - let test_peptides_data = vec![ + let test_peptides = vec![ ("QPYAVSELAGHQTSAESWGTGR", "", "", 0.4328955), ("GMSVSDLADKLSTDDLNSLIAHAHR", "Oxidation@M", "1", 0.6536107), - ("TVQHHVLFTDNMVLICR", "Oxidation@M;Carbamidomethyl@C", "11;15", 0.7811949), + ( + "TVQHHVLFTDNMVLICR", + "Oxidation@M;Carbamidomethyl@C", + "11;15", + 0.7811949, + ), ("EAELDVNEELDKK", "", "", 0.2934583), ("YTPVQQGPVGVNVTYGGDPIPK", "", "", 0.5863009), ]; - let prediction_context = PredictionContext::new(&test_peptides_data); - - run_prediction(&mut model, &prediction_context)?; + let prediction_data: Vec = test_peptides + .into_iter() + .map(|(seq, mods, sites, rt)| { + let naked = remove_mass_shift(seq); + PeptideData::new( + seq, + &naked, + mods, + sites, + None, + None, + None, + None, + Some(rt), + None, + None, + None, + ) + }) + .collect(); + + run_prediction(&mut model, &prediction_data)?; model.fine_tune(&training_data, modifications, 10, 0.001, 5)?; - run_prediction(&mut model, &prediction_context)?; + run_prediction(&mut model, &prediction_data)?; model.save("alphapeptdeep_rt_cnn_lstm_finetuned.safetensors")?; diff --git a/crates/redeem-properties/src/building_blocks/featurize.rs b/crates/redeem-properties/src/building_blocks/featurize.rs index f7628f2..9317556 100644 --- a/crates/redeem-properties/src/building_blocks/featurize.rs +++ b/crates/redeem-properties/src/building_blocks/featurize.rs @@ -1,5 +1,5 @@ use anyhow::{Result, anyhow}; -use std::collections::HashMap; +use std::{collections::HashMap, sync::Arc}; use candle_core::{DType, Device, Tensor}; use rayon::prelude::*; use std::sync::atomic::{AtomicU32, Ordering}; @@ -39,6 +39,26 @@ pub fn aa_indices_tensor(seq: &str, device: &Device) -> Result { Ok(Tensor::from_slice(&indices, (1, indices.len()), device)?.to_dtype(DType::F32)?.unsqueeze(2)?) } +/// Convert peptide sequences into AA ID array using Arc<[u8]>. +/// This avoids converting the whole sequence to a String or &str unless necessary. +pub fn aa_indices_tensor_from_arc(seq: &Arc<[u8]>, device: &Device) -> Result { + let map = aa_index_map(); + let filtered: Vec = seq + .iter() + .map(|&b| { + let c = b as char; + map.get(&c) + .copied() + .ok_or_else(|| anyhow!("Unknown amino acid character: '{}'", c)) + }) + .collect::>>()?; + + let mut indices = vec![0i64]; // padding start + indices.extend(filtered); + indices.push(0); // padding end + + Ok(Tensor::from_slice(&indices, (1, indices.len()), device)?.to_dtype(DType::F32)?.unsqueeze(2)?) +} /// One-hot encode amino acid indices and concatenate additional tensors. 
pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result<Tensor> {
@@ -58,8 +78,6 @@ pub fn aa_one_hot(aa_indices: &Tensor, cat_others: &[&Tensor]) -> Result<Tensor>
            }
        }
    }
-
-
    let mut one_hot_data = vec![0.0f32; batch_size * seq_len * num_classes];
@@ -166,10 +184,59 @@ pub fn get_mod_features_from_parsed(
 }

+pub fn get_mod_features_from_parsed_arc(
+    mod_names: &[Arc<[u8]>],
+    mod_sites: &[usize],
+    seq_len: usize,
+    mod_feature_size: usize,
+    mod_to_feature: &HashMap<Arc<[u8]>, Vec<f32>>,
+    device: &Device,
+) -> Result<Tensor> {
+    let atomic_buffer: Vec<AtomicU32> = (0..seq_len * mod_feature_size)
+        .map(|_| AtomicU32::new(0))
+        .collect();
+
+    mod_names
+        .par_iter()
+        .zip(mod_sites.par_iter())
+        .for_each(|(mod_name, &site)| {
+            if site >= seq_len {
+                log::warn!(
+                    "Skipping mod {:?} at invalid site {} (seq_len {})",
+                    std::str::from_utf8(mod_name).unwrap_or(""),
+                    site,
+                    seq_len
+                );
+                return;
+            }
+            if let Some(feat) = mod_to_feature.get(mod_name) {
+                for (i, &val) in feat.iter().enumerate() {
+                    let idx = site * mod_feature_size + i;
+                    // Caveat: fetch_add on the raw f32 bit pattern is only
+                    // correct when each (site, feature) slot is written once
+                    // (0 + bits == bits); integer-adding bit patterns is not
+                    // float addition if two mods ever hit the same slot.
+                    let val_bits = val.to_bits();
+                    atomic_buffer[idx].fetch_add(val_bits, Ordering::Relaxed);
+                }
+            } else {
+                log::warn!(
+                    "Unknown modification feature: {:?}",
+                    std::str::from_utf8(mod_name).unwrap_or("")
+                );
+            }
+        });
+
+    let mod_x: Vec<f32> = atomic_buffer
+        .into_iter()
+        .map(|a| f32::from_bits(a.load(Ordering::Relaxed)))
+        .collect();
+
+    Tensor::from_slice(&mod_x, (1, seq_len, mod_feature_size), device)
+        .map_err(|e| anyhow!("Failed to create tensor: {}", e))
+}
+

 #[cfg(test)]
 mod tests {
-    use crate::utils::peptdeep_utils::load_mod_to_feature;
+    use crate::utils::peptdeep_utils::{load_mod_to_feature, load_mod_to_feature_arc};
    use crate::utils::peptdeep_utils::parse_model_constants;
    use crate::utils::peptdeep_utils::ModelConstants;
@@ -183,13 +250,23 @@ mod tests {
    fn test_aa_indices_tensor(){
        let device = Device::Cpu;
        let seq = "AGHCEWQMKYR";
+        let start_time = std::time::Instant::now();
        let result = aa_indices_tensor(seq, &device).unwrap();
+        println!("aa_indices_tensor Time taken: {:?}", start_time.elapsed());
        // expected result is [[0, 1, 7, 8, 3, 5, 23, 17, 13, 11, 25, 18, 0]]
        let expect_out = Tensor::from_vec(vec!{0.0f32, 1.0f32, 7.0f32, 8.0f32, 3.0f32, 5.0f32, 23.0f32, 17.0f32, 13.0f32, 11.0f32, 25.0f32, 18.0f32, 0.0f32}, (1, 13), &device).unwrap();
        println!("{:?} - aa_indices_tensor: {:?}", seq, result.to_vec3::<f32>().unwrap());
        println!("result shape: {:?}", result.shape());
        assert_eq!(result.shape().dims(), &[1, 13, 1]);
        // assert_eq!(result.to_vec3::<f32>().unwrap(), expect_out.to_vec3::<f32>().unwrap());
+
+        let seq_bytes = Arc::from(seq.as_bytes().to_vec().into_boxed_slice());
+        let start_time = std::time::Instant::now();
+        let result = aa_indices_tensor_from_arc(&seq_bytes, &device).unwrap();
+        println!("aa_indices_tensor_from_arc Time taken: {:?}", start_time.elapsed());
+        println!("{:?} - aa_indices_tensor_from_arc: {:?}", seq, result.to_vec3::<f32>().unwrap());
+        assert_eq!(result.shape().dims(), &[1, 13, 1]);
+        // assert_eq!(result.to_vec3::<f32>().unwrap(), expect_out.to_vec3::<f32>().unwrap());
    }

    #[test]
@@ -214,6 +291,7 @@ mod tests {
        let mod_to_feature: HashMap<String, Vec<f32>> = load_mod_to_feature(&constants).unwrap();
        let device = Device::Cpu;

+        let start_time = std::time::Instant::now();
        let tensor = get_mod_features_from_parsed(
            &mod_names,
            &mod_sites,
            seq_len,
            mod_feature_size,
            &mod_to_feature,
            &device,
        ).unwrap();
+        println!("get_mod_features_from_parsed Time taken: {:?}", start_time.elapsed());
        println!("tensor shape: {:?}", tensor.shape());
        assert_eq!(tensor.shape().dims(), &[1,
seq_len, mod_feature_size]); + let mod_to_feature = load_mod_to_feature_arc(&constants).unwrap(); + let mod_names_arc: Vec> = mod_names + .iter() + .map(|&s| Arc::from(s.as_bytes().to_vec().into_boxed_slice())) + .collect(); + let mod_sites_arc: Vec = mod_sites + .iter() + .map(|&s| s) + .collect(); + let start_time = std::time::Instant::now(); + let tensor_arc = get_mod_features_from_parsed_arc( + &mod_names_arc, + &mod_sites_arc, + seq_len, + mod_feature_size, + &mod_to_feature, + &device, + ).unwrap(); + println!("get_mod_features_from_parsed_arc Time taken: {:?}", start_time.elapsed()); + println!("tensor_arc shape: {:?}", tensor_arc.shape()); + assert_eq!(tensor_arc.shape().dims(), &[1, seq_len, mod_feature_size]); + assert_eq!(tensor.shape(), tensor_arc.shape()); + + } } \ No newline at end of file diff --git a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs index 58b08cb..b9048e4 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_lstm_model.rs @@ -1,21 +1,20 @@ use anyhow::Result; use candle_core::{DType, Device, IndexOp, Tensor}; -use candle_nn::{ - Dropout, Module, VarBuilder, VarMap, -}; +use candle_nn::{Dropout, Module, VarBuilder, VarMap}; use std::collections::HashMap; -use std::{fmt, vec}; use std::path::Path; +use std::sync::Arc; +use std::{fmt, vec}; use crate::building_blocks::building_blocks::{ DecoderLinear, Encoder26aaModChargeCnnLstmAttnSum, MOD_FEATURE_SIZE, }; use crate::{ - models::model_interface::{ModelInterface, PropertyType,load_tensors_from_model, create_var_map}, - utils::peptdeep_utils::{ - load_mod_to_feature, parse_model_constants, ModelConstants, + models::model_interface::{ + create_var_map, load_tensors_from_model, ModelInterface, PropertyType, }, + utils::peptdeep_utils::{load_mod_to_feature_arc, parse_model_constants, ModelConstants}, }; // Constants @@ -29,7 +28,7 @@ pub struct CCSCNNLSTMModel { var_store: VarBuilder<'static>, varmap: VarMap, constants: ModelConstants, - mod_to_feature: HashMap>, + mod_to_feature: HashMap, Vec>, fixed_sequence_len: usize, // Total number of fragment types of a fragmentation position to predict num_frag_types: usize, @@ -55,11 +54,10 @@ impl ModelInterface for CCSCNNLSTMModel { } fn model_arch(&self) -> &'static str { - "ccs_cnn_lstm" + "ccs_cnn_lstm" } - fn new_untrained(_device: Device) -> Result - { + fn new_untrained(_device: Device) -> Result { unimplemented!("Untrained model creation is not implemented for this architecture."); } @@ -71,7 +69,7 @@ impl ModelInterface for CCSCNNLSTMModel { num_frag_types: usize, num_modloss_types: usize, mask_modloss: bool, - device: Device + device: Device, ) -> Result { let tensor_data = load_tensors_from_model(model_path.as_ref(), &device)?; @@ -86,7 +84,7 @@ impl ModelInterface for CCSCNNLSTMModel { }; // Load the mod_to_feature mapping - let mod_to_feature = load_mod_to_feature(&constants)?; + let mod_to_feature = load_mod_to_feature_arc(&constants)?; let dropout = Dropout::new(0.1); @@ -95,9 +93,7 @@ impl ModelInterface for CCSCNNLSTMModel { 8, 128, 2, - vec![ - "ccs_encoder.mod_nn.nn.weight" - ], + vec!["ccs_encoder.mod_nn.nn.weight"], vec![ "ccs_encoder.input_cnn.cnn_short.weight", "ccs_encoder.input_cnn.cnn_medium.weight", @@ -106,11 +102,12 @@ impl ModelInterface for CCSCNNLSTMModel { vec![ "ccs_encoder.input_cnn.cnn_short.bias", "ccs_encoder.input_cnn.cnn_medium.bias", - "ccs_encoder.input_cnn.cnn_long.bias" + 
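+                // (key names mirror AlphaPeptDeep's PyTorch state_dict layout)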
"ccs_encoder.input_cnn.cnn_long.bias", ], "ccs_encoder.hidden_nn", - vec!["ccs_encoder.attn_sum.attn.0.weight"] - ).unwrap(); + vec!["ccs_encoder.attn_sum.attn.0.weight"], + ) + .unwrap(); let ccs_decoder = DecoderLinear::from_varstore( &var_store, @@ -138,10 +135,9 @@ impl ModelInterface for CCSCNNLSTMModel { is_training: false, dropout, ccs_encoder, - ccs_decoder + ccs_decoder, }) } - fn forward(&self, xs: &Tensor) -> Result { let (_batch_size, _seq_len, _) = xs.shape().dims3()?; @@ -154,15 +150,17 @@ impl ModelInterface for CCSCNNLSTMModel { let mod_x_out = xs.i((.., .., start_mod_x..start_mod_x + MOD_FEATURE_SIZE))?; let charge_out = xs.i((.., 0..1, start_charge..start_charge + 1))?; let charge_out = charge_out.squeeze(2)?; - - let x = self.ccs_encoder.forward(&aa_indices_out, &mod_x_out, &charge_out)?; + + let x = self + .ccs_encoder + .forward(&aa_indices_out, &mod_x_out, &charge_out)?; let x = self.dropout.forward(&x, true)?; let x = Tensor::cat(&[x, charge_out], 1)?; let x = self.ccs_decoder.forward(&x)?; Ok(x.squeeze(1)?) } - + /// Set model to evaluation mode for inference /// This disables dropout and other training-specific layers. fn set_evaluation_mode(&mut self) { @@ -192,35 +190,35 @@ impl ModelInterface for CCSCNNLSTMModel { self.constants.mod_elements.len() } - fn get_mod_to_feature(&self) -> &HashMap> { + fn get_mod_to_feature(&self) -> &HashMap, Vec> { &self.mod_to_feature } fn get_min_pred_intensity(&self) -> f32 { - unimplemented!("Method not implemented for architecture: {}", self.model_arch()) + unimplemented!( + "Method not implemented for architecture: {}", + self.model_arch() + ) } - fn get_mut_varmap(&mut self) -> &mut VarMap { &mut self.varmap } - + fn print_summary(&self) { todo!() } - + fn print_weights(&self) { todo!() } - } - // // Forward Module Trait Implementation // impl Module for CCSCNNLSTMModel { // fn forward(&self, input: &Tensor) -> Result { // ModelInterface::forward(self, input) -// } +// } // } impl fmt::Debug for CCSCNNLSTMModel { @@ -239,14 +237,26 @@ impl fmt::Debug for CCSCNNLSTMModel { // CNN writeln!(f, " (input_cnn): SeqCNN(")?; - writeln!(f, " (cnn_short): Conv1d(36, 36, kernel_size=(3,), stride=(1,), padding=(1,))")?; - writeln!(f, " (cnn_medium): Conv1d(36, 36, kernel_size=(5,), stride=(1,), padding=(2,))")?; - writeln!(f, " (cnn_long): Conv1d(36, 36, kernel_size=(7,), stride=(1,), padding=(3,))")?; + writeln!( + f, + " (cnn_short): Conv1d(36, 36, kernel_size=(3,), stride=(1,), padding=(1,))" + )?; + writeln!( + f, + " (cnn_medium): Conv1d(36, 36, kernel_size=(5,), stride=(1,), padding=(2,))" + )?; + writeln!( + f, + " (cnn_long): Conv1d(36, 36, kernel_size=(7,), stride=(1,), padding=(3,))" + )?; writeln!(f, " )")?; // Hidden LSTM writeln!(f, " (hidden_nn): SeqLSTM(")?; - writeln!(f, " (rnn): LSTM(144, 128, num_layers=2, batch_first=True, bidirectional=True)")?; + writeln!( + f, + " (rnn): LSTM(144, 128, num_layers=2, batch_first=True, bidirectional=True)" + )?; writeln!(f, " )")?; // Attention Sum @@ -275,21 +285,21 @@ impl fmt::Debug for CCSCNNLSTMModel { " (2): Linear(in_features=64, out_features=1, bias=True)" )?; writeln!(f, " )")?; - + writeln!(f, " )")?; - + write!(f, ")") } } - #[cfg(test)] mod tests { use super::*; - use crate::models::model_interface::ModelInterface; use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel; + use crate::models::model_interface::ModelInterface; use candle_core::Device; use std::path::PathBuf; + use std::sync::Arc; #[test] fn test_load_pretrained_ccs_cnn_lstm_model() { @@ -297,7 +307,8 @@ mod 
tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); let device = Device::Cpu; - let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + let model = + CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); println!("{:?}", model); } @@ -308,17 +319,29 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); let device = Device::Cpu; - let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); - - let peptide_sequences = "AGHCEWQMKYR"; - let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; - let mod_sites = "0;4;8"; + let model = + CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + + let peptide_sequences = Arc::from("AGHCEWQMKYR".as_bytes().to_vec().into_boxed_slice()); + let mods = Arc::from( + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .as_bytes() + .to_vec() + .into_boxed_slice(), + ); + let mod_sites = Arc::from("0;4;8".as_bytes().to_vec().into_boxed_slice()); let charge = Some(2); let nce = Some(20); - let instrument = Some("QE"); + let instrument = Some(Arc::from("QE".as_bytes().to_vec().into_boxed_slice())); - let result = - model.encode_peptide(&peptide_sequences, mods, mod_sites, charge, nce, instrument); + let result = model.encode_peptide( + &peptide_sequences, + &mods, + &mod_sites, + charge, + nce, + instrument.as_ref(), + ); println!("{:?}", result); @@ -328,22 +351,31 @@ mod tests { } #[test] - fn test_predict(){ + fn test_predict() { let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth"); let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); let device = Device::Cpu; - let model = CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); - let peptide_sequences = vec!["AGHCEWQMKYR", "AGHCEWQMKYR"]; - let mods = vec!["Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"]; - let mod_sites = vec!["0;4;8", "0;4;8"]; + let model = + CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + + let seq: Arc<[u8]> = Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()); + let mods: Arc<[u8]> = Arc::from( + b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .to_vec() + .into_boxed_slice(), + ); + let mod_sites: Arc<[u8]> = Arc::from(b"0;4;8".to_vec().into_boxed_slice()); + + let peptide_sequences = vec![seq.clone(), seq]; + let mods = vec![mods.clone(), mods]; + let mod_sites = vec![mod_sites.clone(), mod_sites]; let charge = Some(vec![2, 2]); let result = model.predict(&peptide_sequences, &mods, &mod_sites, charge, None, None); println!("{:?}", result); - } - - -} \ No newline at end of file + assert!(result.is_ok()); + } +} diff --git a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs index e1e0ff2..9499f19 100644 --- a/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs +++ b/crates/redeem-properties/src/models/ccs_cnn_tf_model.rs @@ -3,13 +3,14 @@ use candle_core::{DType, Device, IndexOp, Tensor}; use candle_nn::{Dropout, Module, VarBuilder, VarMap}; use std::collections::HashMap; use std::path::Path; +use std::sync::Arc; use crate::building_blocks::building_blocks::{ DecoderLinear, Encoder26aaModChargeCnnTransformerAttnSum, 
MOD_FEATURE_SIZE, }; use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map}; use crate::utils::peptdeep_utils::{ - load_mod_to_feature, + load_mod_to_feature_arc, parse_model_constants, ModelConstants, }; use crate::utils::utils::get_tensor_stats; @@ -27,7 +28,7 @@ pub struct CCSCNNTFModel { varmap: VarMap, constants: ModelConstants, device: Device, - mod_to_feature: HashMap>, + mod_to_feature: HashMap, Vec>, dropout: Dropout, ccs_encoder: Encoder26aaModChargeCnnTransformerAttnSum, ccs_decoder: DecoderLinear, @@ -69,7 +70,7 @@ impl ModelInterface for CCSCNNTFModel { log::trace!("[CCSCNNTFModel] Initializing ccs_decoder"); let ccs_decoder = DecoderLinear::new(129, 1, &varbuilder.pp("ccs_decoder"))?; let constants = ModelConstants::default(); - let mod_to_feature = load_mod_to_feature(&constants)?; + let mod_to_feature = load_mod_to_feature_arc(&constants)?; Ok(Self { var_store: varbuilder, @@ -104,7 +105,7 @@ impl ModelInterface for CCSCNNTFModel { None => ModelConstants::default(), }; - let mod_to_feature = load_mod_to_feature(&constants)?; + let mod_to_feature = load_mod_to_feature_arc(&constants)?; let dropout = Dropout::new(0.1); let ccs_encoder = Encoder26aaModChargeCnnTransformerAttnSum::from_varstore( @@ -211,7 +212,7 @@ impl ModelInterface for CCSCNNTFModel { self.constants.mod_elements.len() } - fn get_mod_to_feature(&self) -> &HashMap> { + fn get_mod_to_feature(&self) -> &HashMap, Vec> { &self.mod_to_feature } @@ -257,20 +258,17 @@ mod tests { let device = Device::Cpu; let model = Box::new(CCSCNNTFModel::new_untrained(device.clone()).unwrap()); - let peptide_sequences = "AGHCEWQMKYR"; - let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; - let mod_sites = "0;4;8"; + let seq = Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()); + let mods = + Arc::from(b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M".to_vec().into_boxed_slice()); + let mod_sites = Arc::from(b"0;4;8".to_vec().into_boxed_slice()); let charge = Some(2); let nce = Some(20); - let instrument = Some("QE"); + let instrument = Some(Arc::from(b"QE".to_vec().into_boxed_slice())); - let result = - model.encode_peptide(&peptide_sequences, mods, mod_sites, charge, nce, instrument); + let result = model.encode_peptide(&seq, &mods, &mod_sites, charge, nce, instrument.as_ref()); println!("{:?}", result); - - // assert!(result.is_ok()); - // let encoded_peptides = result.unwrap(); - // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1)); + assert!(result.is_ok()); } } \ No newline at end of file diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs index a9f667c..a61b2e3 100644 --- a/crates/redeem-properties/src/models/ccs_model.rs +++ b/crates/redeem-properties/src/models/ccs_model.rs @@ -1,13 +1,14 @@ -use std::path::Path; -use candle_core::Device; -use anyhow::{Result, anyhow}; -use crate::models::model_interface::{ModelInterface,PredictionResult}; use crate::models::ccs_cnn_lstm_model::CCSCNNLSTMModel; use crate::models::ccs_cnn_tf_model::CCSCNNTFModel; +use crate::models::model_interface::{ModelInterface, PredictionResult}; use crate::utils::data_handling::PeptideData; +use crate::utils::peptdeep_utils::ModificationMap; use crate::utils::stats::TrainingStepMetrics; +use anyhow::{anyhow, Result}; +use candle_core::Device; use std::collections::HashMap; -use crate::utils::peptdeep_utils::ModificationMap; +use std::path::Path; +use std::sync::Arc; // Enum for different 
types of CCS models pub enum CCSModelArch { @@ -26,32 +27,92 @@ pub struct CCSModelWrapper { impl Clone for CCSModelWrapper { fn clone(&self) -> Self { CCSModelWrapper { - model: self.model.clone(), + model: self.model.clone(), } } } impl CCSModelWrapper { - pub fn new>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { + pub fn new>( + model_path: P, + constants_path: P, + arch: &str, + device: Device, + ) -> Result { let model: Box = match arch { - "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), - "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), + "ccs_cnn_lstm" => Box::new(CCSCNNLSTMModel::new( + model_path, + Some(constants_path), + 0, + 8, + 4, + true, + device, + )?), + "ccs_cnn_tf" => Box::new(CCSCNNTFModel::new( + model_path, + Some(constants_path), + 0, + 8, + 4, + true, + device, + )?), _ => return Err(anyhow!("Unsupported CCS model architecture: {}", arch)), }; Ok(Self { model }) } - pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>, charge: Vec) -> Result { - self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), None, None) + pub fn predict( + &self, + peptide_sequence: &[Arc<[u8]>], + mods: &[Arc<[u8]>], + mod_sites: &[Arc<[u8]>], + charge: Vec, + ) -> Result { + self.model + .predict(peptide_sequence, mods, mod_sites, Some(charge), None, None) } - pub fn train(&mut self, training_data: &Vec, val_data: Option<&Vec>, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result { - self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience) + pub fn train( + &mut self, + training_data: &Vec, + val_data: Option<&Vec>, + modifications: HashMap<(String, Option), ModificationMap>, + batch_size: usize, + val_batch_size: usize, + learning_rate: f64, + epochs: usize, + early_stopping_patience: usize, + ) -> Result { + self.model.train( + training_data, + val_data, + modifications, + batch_size, + val_batch_size, + learning_rate, + epochs, + early_stopping_patience, + ) } - pub fn fine_tune(&mut self, training_data: &Vec, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, learning_rate: f64, epochs: usize) -> Result<()> { - self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs) + pub fn fine_tune( + &mut self, + training_data: &Vec, + modifications: HashMap<(String, Option), ModificationMap>, + batch_size: usize, + learning_rate: f64, + epochs: usize, + ) -> Result<()> { + self.model.fine_tune( + training_data, + modifications, + batch_size, + learning_rate, + epochs, + ) } pub fn set_evaluation_mode(&mut self) { @@ -76,7 +137,12 @@ impl CCSModelWrapper { } // Public API Function to load a new CCS model -pub fn load_collision_cross_section_model>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { +pub fn load_collision_cross_section_model>( + model_path: P, + constants_path: P, + arch: &str, + device: Device, +) -> Result { CCSModelWrapper::new(model_path, constants_path, arch, device) } @@ -92,7 +158,7 @@ pub fn load_collision_cross_section_model>(model_path: P, constan // fn peptide_ccs_prediction() { // let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth"); // let constants_path = 
PathBuf::from("data/models/alphapeptdeep/generic/ccs.pth.model_const.yaml"); - + // assert!( // model_path.exists(), // "\n╔══════════════════════════════════════════════════════════════════╗\n\ @@ -107,7 +173,7 @@ pub fn load_collision_cross_section_model>(model_path: P, constan // ╚══════════════════════════════════════════════════════════════════╝\n", // model_path // ); - + // assert!( // constants_path.exists(), // "\n╔══════════════════════════════════════════════════════════════════╗\n\ @@ -124,12 +190,12 @@ pub fn load_collision_cross_section_model>(model_path: P, constan // ); // let result = load_collision_cross_section_model(&model_path, &constants_path, "ccs_cnn_lstm", Device::Cpu); - + // assert!(result.is_ok(), "Failed to load model: {:?}", result.err()); // let mut model = result.unwrap(); // // model.print_summary(); - + // // Print the model's weights // // model.print_weights(); @@ -164,4 +230,4 @@ pub fn load_collision_cross_section_model>(model_path: P, constan // }, // } // } -// } \ No newline at end of file +// } diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 8d67d51..2353c44 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -1,11 +1,18 @@ use crate::{ - building_blocks::featurize::{self, aa_indices_tensor, get_mod_features_from_parsed}, + building_blocks::featurize::{ + self, aa_indices_tensor, aa_indices_tensor_from_arc, get_mod_features_from_parsed, + get_mod_features_from_parsed_arc, + }, models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper}, utils::{ - data_handling::{PeptideBatchData, PeptideData, RTNormalization}, logging::Progress, peptdeep_utils::{ + data_handling::{PeptideBatchData, PeptideData, RTNormalization}, + logging::Progress, + peptdeep_utils::{ get_modification_indices, get_modification_string, parse_instrument_index, remove_mass_shift, - }, stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, utils::{get_tensor_stats, CosineWithWarmup, LRScheduler} + }, + stats::{compute_loss_stats, Metrics, TrainingPhase, TrainingStepMetrics}, + utils::{get_tensor_stats, CosineWithWarmup, LRScheduler}, }, }; use anyhow::{Context, Result}; @@ -13,9 +20,13 @@ use candle_core::{DType, Device, Tensor, Var}; use candle_nn::{Optimizer, VarMap}; use log::info; use rayon::prelude::*; -use std::{ops::{Deref, Index}, process::Output}; use std::path::Path; use std::{collections::HashMap, path::PathBuf}; +use std::{ + ops::{Deref, Index}, + process::Output, + sync::Arc, +}; // Constants const CHARGE_FACTOR: f64 = 0.1; @@ -231,30 +242,35 @@ pub trait ModelInterface: Send + Sync + ModelClone { /// Predict the property for a batch of peptide sequences. /// /// # Arguments - /// * `peptide_sequences` - A vector of peptide sequences. - /// * `mods` - A vector of strings representing the modifications for each peptide. - /// * `mod_sites` - A vector of strings representing the modification sites for each peptide. - /// * `charge` - An optional vector of charge states for each peptide. - /// * `nce` - An optional vector of nominal collision energies for each peptide. - /// * `instrument` - An optional vector of instrument names for each peptide. + /// * `peptide_sequences` - A slice of `Arc<[u8]>` containing each peptide sequence. + /// * `mods` - A slice of `Arc<[u8]>` with modifications for each peptide. 
+ /// * `mod_sites` - A slice of `Arc<[u8]>` representing modification sites per peptide. + /// * `charges` - Optional vector of charge states. + /// * `nces` - Optional vector of normalized collision energies. + /// * `instruments` - Optional vector of instrument names as `Arc<[u8]>`. /// /// # Returns - /// A vector of predicted retention times. + /// A `PredictionResult` containing either RT, CCS, or MS2 predictions. fn predict( &self, - peptide_sequences: &Vec<&str>, - mods: &Vec<&str>, - mod_sites: &Vec<&str>, - charge: Option>, - nce: Option>, - instrument: Option<&Vec<&str>>, + peptide_sequences: &[Arc<[u8]>], + mods: &[Arc<[u8]>], + mod_sites: &[Arc<[u8]>], + charges: Option>, + nces: Option>, + instruments: Option>>>, ) -> Result { - // Encode the batch of peptides let input_tensor = self - .encode_peptides(peptide_sequences, mods, mod_sites, charge, nce, instrument.cloned())? + .encode_peptides( + peptide_sequences, + mods, + mod_sites, + charges, + nces, + instruments, + )? .to_device(self.get_device())?; - // Forward pass through the model let output = self.forward(&input_tensor)?; match self.property_type() { @@ -268,7 +284,6 @@ pub trait ModelInterface: Send + Sync + ModelClone { } PropertyType::MS2 => { let out = self.process_predictions(&output, self.get_min_pred_intensity())?; - // Each prediction per peptide is a vector of vectors of f32, i.e. Number of fragment ions by number of ion types ordered as b_z1, b_z2, y_z1, y_z2, b_modloss_z1, b_modloss_z2, y_modloss_z1, y_modloss_z2 let predictions: Vec>> = out.to_vec3()?; Ok(PredictionResult::MS2Result(predictions)) } @@ -282,31 +297,40 @@ pub trait ModelInterface: Send + Sync + ModelClone { /// Encode peptide sequence (plus modifications) into a tensor. fn encode_peptide( &self, - peptide_sequence: &str, - mods: &str, - mod_sites: &str, + peptide_sequence: &Arc<[u8]>, + mods: &Arc<[u8]>, + mod_sites: &Arc<[u8]>, charge: Option, nce: Option, - instrument: Option<&str>, + instrument: Option<&Arc<[u8]>>, ) -> Result { let device = self.get_device(); let mod_feature_size = self.get_mod_element_count(); let mod_to_feature = self.get_mod_to_feature(); - log::trace!("[ModelInterface::encode_peptide] peptide_sequence: {} | mods: {} | mod_sites: {} | charge: {:?} | nce: {:?} | instrument: {:?}", peptide_sequence, mods, mod_sites, charge, nce, instrument); - - let aa_tensor = aa_indices_tensor(peptide_sequence, device)?; + log::trace!( + "[ModelInterface::encode_peptide] peptide_sequence: {:?} | mods: {:?} | mod_sites: {:?} | charge: {:?} | nce: {:?} | instrument: {:?}", + peptide_sequence, mods, mod_sites, charge, nce, instrument + ); + let aa_tensor = aa_indices_tensor_from_arc(peptide_sequence, device)?; let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?; - let mod_names: Vec<&str> = mods.split(';').filter(|s| !s.is_empty()).collect(); - let mod_indices: Vec = mod_sites + let mod_names: Vec> = std::str::from_utf8(mods) + .unwrap_or("") + .split(';') + .filter(|s| !s.is_empty()) + .map(|s| Arc::from(s.as_bytes().to_vec().into_boxed_slice())) + .collect(); + + let mod_indices: Vec = std::str::from_utf8(mod_sites) + .unwrap_or("") .split(';') .filter(|s| !s.is_empty()) .map(|s| s.parse::().unwrap()) .collect(); - let mod_tensor = get_mod_features_from_parsed( + let mod_tensor = get_mod_features_from_parsed_arc( &mod_names, &mod_indices, seq_len, @@ -338,7 +362,8 @@ pub trait ModelInterface: Send + Sync + ModelClone { } if let Some(instr) = instrument { - let instr_idx = parse_instrument_index(instr) as u32; + let instr_str = 
std::str::from_utf8(instr).unwrap_or(""); + let instr_idx = parse_instrument_index(instr_str) as u32; let instr_tensor = Tensor::from_slice(&vec![instr_idx; seq_len], &[batch_size, seq_len, 1], device)? .to_dtype(DType::F32)?; @@ -349,7 +374,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { let output = features.remove(0); let (mean, min, max) = get_tensor_stats(&output)?; if !mean.is_finite() || !min.is_finite() || !max.is_finite() { - log::error!("For Peptide = {peptide_sequence} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}"); + log::error!("For Peptide = {:?} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}", peptide_sequence); anyhow::bail!("Non-finite values found in peptide encoding output."); } Ok(output) @@ -357,7 +382,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { let output = Tensor::cat(&features, 2)?; let (mean, min, max) = get_tensor_stats(&output)?; if !mean.is_finite() || !min.is_finite() || !max.is_finite() { - log::error!("For Peptide = {peptide_sequence} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}"); + log::error!("For Peptide = {:?} encode_peptides produced non-finite tensor stats: mean={mean}, min={min}, max={max}", peptide_sequence); anyhow::bail!("Non-finite values found in peptide encoding output."); } Ok(output) @@ -367,12 +392,12 @@ pub trait ModelInterface: Send + Sync + ModelClone { /// Encode a batch of peptide sequences into a tensor fn encode_peptides( &self, - peptide_sequences: &Vec<&str>, - mods: &Vec<&str>, - mod_sites: &Vec<&str>, + peptide_sequences: &[Arc<[u8]>], + mods: &[Arc<[u8]>], + mod_sites: &[Arc<[u8]>], charges: Option>, nces: Option>, - instruments: Option,>, + instruments: Option>>>, ) -> Result { let len = peptide_sequences.len(); @@ -385,14 +410,14 @@ pub trait ModelInterface: Send + Sync + ModelClone { &mod_sites[i], charges.as_ref().map(|v| v[i]), nces.as_ref().map(|v| v[i]), - instruments.as_ref().map(|v| v[i]), + instruments.as_ref().and_then(|v| v[i].as_ref()), ) }) .collect::>>()?; if tensors.is_empty() { return Err(anyhow::anyhow!( - "Encoding batch of peptides failed, the resulting tesnor batch is empty." + "Encoding batch of peptides failed, the resulting tensor batch is empty." )); } @@ -402,7 +427,6 @@ pub trait ModelInterface: Send + Sync + ModelClone { .max() .unwrap_or(0); - // Consistency check for feature dimension let expected_feat_dim = tensors .get(0) .ok_or_else(|| anyhow::anyhow!("Empty input batch"))? @@ -415,7 +439,6 @@ pub trait ModelInterface: Send + Sync + ModelClone { .map(|t| { let (_, seq_len, feat_dim) = t.shape().dims3()?; - // Check that all tensors have the same feature dimension if feat_dim != expected_feat_dim { return Err(anyhow::anyhow!( "Inconsistent feature dim: expected {}, got {}", @@ -425,7 +448,8 @@ pub trait ModelInterface: Send + Sync + ModelClone { } if seq_len < max_len { - let pad = Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?; + let pad = + Tensor::zeros(&[1, max_len - seq_len, feat_dim], t.dtype(), t.device())?; Ok(Tensor::cat(&[&t, &pad], 1)?) 
} else { Ok(t) @@ -495,7 +519,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { recalls: vec![], accuracies: vec![], }; - + let mut step_idx = 0; let mut val_step_idx = 0; @@ -505,10 +529,10 @@ pub trait ModelInterface: Send + Sync + ModelClone { }; let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; let mut lr_scheduler = CosineWithWarmup::new( - learning_rate, - warmup_steps, - total_steps, - 0.5 // one full cosine cycle + learning_rate, + warmup_steps, + total_steps, + 0.5, // one full cosine cycle ); let mut best_val_loss = f32::INFINITY; @@ -543,13 +567,15 @@ pub trait ModelInterface: Send + Sync + ModelClone { PropertyType::CCS => { let tol: Vec = targets.iter().map(|t| t * 0.02).collect(); Some(Metrics::accuracy_dynamic(&predictions, &targets, &tol)) - }, // is predicted CCS within 2% of target CCS? + } // is predicted CCS within 2% of target CCS? _ => None, }; - + step_metrics.epochs.push(epoch); step_metrics.steps.push(step_idx); - step_metrics.learning_rates.push(lr_scheduler.get_last_lr() as f64); + step_metrics + .learning_rates + .push(lr_scheduler.get_last_lr() as f64); step_metrics.losses.push(loss_val); step_metrics.phases.push(TrainingPhase::Train); step_metrics.accuracies.push(acc); @@ -583,23 +609,26 @@ pub trait ModelInterface: Send + Sync + ModelClone { .par_chunks(validation_batch_size) .enumerate() .map(|(idx, batch_data)| { - let (input_val, target_val) = self.prepare_batch_inputs(batch_data, &modifications)?; + let (input_val, target_val) = + self.prepare_batch_inputs(batch_data, &modifications)?; let predicted = self.forward(&input_val)?; let val_loss = candle_nn::loss::mse(&predicted, &target_val)?; let loss_val = val_loss.to_vec0::()?; - + let predictions = predicted.to_vec1::()?; let targets = target_val.to_vec1::()?; - + let acc = match self.property_type() { - PropertyType::RT => Some(Metrics::accuracy(&predictions, &targets, 0.5)), + PropertyType::RT => { + Some(Metrics::accuracy(&predictions, &targets, 0.5)) + } PropertyType::CCS => { let tol: Vec = targets.iter().map(|t| t * 0.02).collect(); Some(Metrics::accuracy_dynamic(&predictions, &targets, &tol)) - }, + } _ => None, }; - + Ok((loss_val, idx, lr_scheduler.get_last_lr(), acc)) }) .collect::>()?; @@ -616,7 +645,8 @@ pub trait ModelInterface: Send + Sync + ModelClone { } val_step_idx += val_results.len(); - let val_losses: Vec = val_results.iter().map(|(loss, _, _, _)| *loss).collect(); + let val_losses: Vec = + val_results.iter().map(|(loss, _, _, _)| *loss).collect(); let (avg_val_loss, std_val_loss): (f32, f32) = compute_loss_stats(&val_losses); epoch_losses.push(( @@ -652,7 +682,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { let checkpoint_path = format!( "redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), - epoch- 1 + epoch - 1 ); // Check if the prior checkpoint exists, if it does delete it if PathBuf::from(&checkpoint_path).exists() { @@ -677,7 +707,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { let checkpoint_path = format!( "redeem_{}_ckpt_model_epoch_{}.safetensors", self.get_model_arch(), - epoch- 1 + epoch - 1 ); // Check if the prior checkpoint exists, if it does delete it if PathBuf::from(&checkpoint_path).exists() { @@ -871,6 +901,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { todo!() } + /// Perform inference over a batch of peptides. 
fn inference( &self, inference_data: &Vec, @@ -887,45 +918,44 @@ pub trait ModelInterface: Send + Sync + ModelClone { inference_data.len(), num_batches ); - + let progress = Progress::new(inference_data.len(), "[inference] Batch:"); let mut result: Vec> = vec![None; inference_data.len()]; - + inference_data .par_chunks(batch_size) .enumerate() .map(|(batch_idx, batch_data)| { let start_idx = batch_idx * batch_size; let batch: PeptideBatchData = batch_data.into(); - - let naked_sequences = batch.naked_sequence_strs(); - let mods = batch.mods_strs(); - let mod_sites = batch.mod_sites_strs(); - + + let naked_sequences = &batch.naked_sequence; + let mods = &batch.mods; + let mod_sites = &batch.mod_sites; + let charges = if batch.charges.iter().all(|c| c.is_some()) { Some(batch.charges.iter().map(|c| c.unwrap()).collect::>()) } else { None }; - + let nces = if batch.nces.iter().all(|n| n.is_some()) { Some(batch.nces.iter().map(|n| n.unwrap()).collect::>()) } else { None }; - + let instruments = if batch.instruments.iter().all(|i| i.is_some()) { - let flat: Vec<&str> = batch.instrument_strs().into_iter().map(|opt| opt.unwrap()).collect(); - Some(flat) + Some(batch.instruments.clone()) } else { None }; - + let input_tensor = self - .encode_peptides(&naked_sequences, &mods, &mod_sites, charges, nces, instruments)? + .encode_peptides(naked_sequences, mods, mod_sites, charges, nces, instruments)? .to_device(self.get_device())?; let output = self.forward(&input_tensor)?; - + match self.property_type() { PropertyType::RT | PropertyType::CCS => { let predictions = output.to_vec1()?; @@ -938,7 +968,9 @@ pub trait ModelInterface: Send + Sync + ModelClone { PropertyType::RT => { peptide.retention_time = Some(match rt_norm { RTNormalization::ZScore(mean, std) => pred * std + mean, - RTNormalization::MinMax(min, max) => pred * (max - min) + min, + RTNormalization::MinMax(min, max) => { + pred * (max - min) + min + } RTNormalization::None => pred, }); } @@ -962,27 +994,27 @@ pub trait ModelInterface: Send + Sync + ModelClone { result[idx] = Some(peptide); progress.inc(); }); - + progress.finish(); Ok(result.into_iter().flatten().collect()) } - /// Extract encoded input and target tensor for a batch of peptides. fn prepare_batch_inputs( &self, batch_data: &[PeptideData], - _modifications: &HashMap<(String, Option), crate::utils::peptdeep_utils::ModificationMap>, + _modifications: &HashMap< + (String, Option), + crate::utils::peptdeep_utils::ModificationMap, + >, ) -> Result<(Tensor, Tensor)> { use rayon::prelude::*; let batch: PeptideBatchData = batch_data.into(); - let naked_sequences = batch.naked_sequence_strs(); - - let mods = batch.mods_strs(); - - let mod_sites = batch.mod_sites_strs(); + let naked_sequences = &batch.naked_sequence; + let mods = &batch.mods; + let mod_sites = &batch.mod_sites; let charges = if batch.charges.iter().all(|c| c.is_some()) { Some(batch.charges.iter().map(|c| c.unwrap()).collect::>()) @@ -997,19 +1029,13 @@ pub trait ModelInterface: Send + Sync + ModelClone { }; let instruments = if batch.instruments.iter().all(|i| i.is_some()) { - let flat: Vec<&str> = batch - .instrument_strs() - .into_iter() - .map(|opt| opt.unwrap()) - .collect(); - Some(flat) + Some(batch.instruments.clone()) } else { None }; - let input_batch = self - .encode_peptides(&naked_sequences, &mods, &mod_sites, charges, nces, instruments)? + .encode_peptides(naked_sequences, mods, mod_sites, charges, nces, instruments)? 
.to_device(self.get_device())?; let target_values: Vec = match self.property_type() { @@ -1048,7 +1074,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { fn get_mod_element_count(&self) -> usize; - fn get_mod_to_feature(&self) -> &HashMap>; + fn get_mod_to_feature(&self) -> &HashMap, Vec>; fn get_min_pred_intensity(&self) -> f32; @@ -1106,8 +1132,6 @@ pub trait ModelInterface: Send + Sync + ModelClone { } } - - /// Parameters for the `predict` method of a `ModelInterface` implementation. #[derive(Clone)] pub struct Parameters { diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 86ac41c..9bef37c 100644 --- a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -1,9 +1,9 @@ use anyhow::Result; use candle_core::{DType, Device, IndexOp, Tensor}; use candle_nn::{Dropout, Module, VarBuilder, VarMap}; -use std::collections::HashMap; use std::fmt; use std::path::Path; +use std::{collections::HashMap, sync::Arc}; use crate::{ building_blocks::building_blocks::{ @@ -13,7 +13,7 @@ use crate::{ models::model_interface::{ create_var_map, load_tensors_from_model, ModelInterface, PropertyType, }, - utils::peptdeep_utils::{load_mod_to_feature, parse_model_constants, ModelConstants}, + utils::peptdeep_utils::{load_mod_to_feature_arc, parse_model_constants, ModelConstants}, }; // Constants @@ -27,7 +27,7 @@ pub struct MS2BertModel { var_store: VarBuilder<'static>, varmap: VarMap, constants: ModelConstants, - mod_to_feature: HashMap>, + mod_to_feature: HashMap, Vec>, fixed_sequence_len: usize, // Total number of fragment types of a fragmentation position to predict num_frag_types: usize, @@ -60,8 +60,7 @@ impl ModelInterface for MS2BertModel { "ms2_bert" } - fn new_untrained(_device: Device) -> Result - { + fn new_untrained(_device: Device) -> Result { unimplemented!("Untrained model creation is not implemented for this architecture."); } @@ -88,7 +87,7 @@ impl ModelInterface for MS2BertModel { }; // Load the mod_to_feature mapping - let mod_to_feature = load_mod_to_feature(&constants)?; + let mod_to_feature = load_mod_to_feature_arc(&constants)?; let dropout = Dropout::new(0.1); @@ -355,7 +354,7 @@ impl ModelInterface for MS2BertModel { self.constants.mod_elements.len() } - fn get_mod_to_feature(&self) -> &HashMap> { + fn get_mod_to_feature(&self) -> &HashMap, Vec> { &self.mod_to_feature } @@ -461,7 +460,8 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + let model = + MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); println!("{:?}", model); } @@ -472,23 +472,25 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); - - let peptide_sequences = "AGHCEWQMKYR"; - let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; - let mod_sites = "0;4;8"; + let model = + MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + + let seq = Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()); + let mods = Arc::from( + b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .to_vec() + .into_boxed_slice(), + ); + let mod_sites = 
Arc::from(b"0;4;8".to_vec().into_boxed_slice()); let charge = Some(2); let nce = Some(20); - let instrument = Some("QE"); + let instrument = Some(Arc::from(b"QE".to_vec().into_boxed_slice())); let result = - model.encode_peptide(&peptide_sequences, mods, mod_sites, charge, nce, instrument); + model.encode_peptide(&seq, &mods, &mod_sites, charge, nce, instrument.as_ref()); println!("{:?}", result); - - // assert!(result.is_ok()); - // let encoded_peptides = result.unwrap(); - // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1)); + assert!(result.is_ok()); } #[test] @@ -497,17 +499,26 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); let device = Device::Cpu; - let model = MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); - - let peptide_sequences = vec!["AGHCEWQMKYR", "AGHCEWQMKYR"]; - let mods = vec![ - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", - ]; - let mod_sites = vec!["0;4;8", "0;4;8"]; + let model = + MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device).unwrap(); + + let seq: Arc<[u8]> = Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()); + let mods: Arc<[u8]> = Arc::from( + b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .to_vec() + .into_boxed_slice(), + ); + let mod_sites: Arc<[u8]> = Arc::from(b"0;4;8".to_vec().into_boxed_slice()); let charge = Some(vec![2, 2]); let nce = Some(vec![20, 20]); - let instrument = Some(vec!["QE", "QE"]); + let instrument = vec![ + Arc::from(b"QE".to_vec().into_boxed_slice()), + Arc::from(b"QE".to_vec().into_boxed_slice()), + ]; + + let peptide_sequences = vec![seq.clone(), seq]; + let mods = vec![mods.clone(), mods]; + let mod_sites = vec![mod_sites.clone(), mod_sites]; let input_tensor = model .encode_peptides( @@ -516,14 +527,15 @@ mod tests { &mod_sites, charge, nce, - instrument, + Some(instrument.into_iter().map(Some).collect()), ) .unwrap(); + let output = model.forward(&input_tensor).unwrap(); println!("{:?}", output); let prediction: Vec>> = output.to_vec3().unwrap(); - println!("{:?}", prediction); + assert_eq!(prediction.len(), 2); } } diff --git a/crates/redeem-properties/src/models/ms2_model.rs b/crates/redeem-properties/src/models/ms2_model.rs index 176b7e7..0a63d72 100644 --- a/crates/redeem-properties/src/models/ms2_model.rs +++ b/crates/redeem-properties/src/models/ms2_model.rs @@ -1,11 +1,12 @@ -use std::path::Path; -use candle_core::{Device, Tensor}; -use anyhow::{Result, anyhow}; -use crate::models::model_interface::{ModelInterface,PredictionResult}; +use crate::models::model_interface::{ModelInterface, PredictionResult}; use crate::models::ms2_bert_model::MS2BertModel; use crate::utils::data_handling::PeptideData; -use std::collections::HashMap; use crate::utils::peptdeep_utils::ModificationMap; +use anyhow::{anyhow, Result}; +use candle_core::{Device, Tensor}; +use std::collections::HashMap; +use std::path::Path; +use std::sync::Arc; // Enum for different types of MS2 models pub enum MS2ModelArch { @@ -24,15 +25,28 @@ pub struct MS2ModelWrapper { impl Clone for MS2ModelWrapper { fn clone(&self) -> Self { MS2ModelWrapper { - model: self.model.clone(), + model: self.model.clone(), } } } impl MS2ModelWrapper { - pub fn new>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { + pub fn new>( + model_path: P, + constants_path: P, + arch: &str, + device: Device, + ) -> Result { let 
model: Box = match arch { - "ms2_bert" => Box::new(MS2BertModel::new(model_path, Some(constants_path), 0, 8, 4, true, device)?), + "ms2_bert" => Box::new(MS2BertModel::new( + model_path, + Some(constants_path), + 0, + 8, + 4, + true, + device, + )?), // Add other cases here as you implement more models _ => return Err(anyhow!("Unsupported MS2 model architecture: {}", arch)), }; @@ -40,12 +54,40 @@ impl MS2ModelWrapper { Ok(Self { model }) } - pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>, charge: Vec, nce: Vec, intsrument: &Vec<&str>) -> Result { - self.model.predict(peptide_sequence, mods, mod_sites, Some(charge), Some(nce), Some(intsrument)) + pub fn predict( + &self, + peptide_sequence: &[Arc<[u8]>], + mods: &[Arc<[u8]>], + mod_sites: &[Arc<[u8]>], + charge: Vec, + nce: Vec, + intsrument: Vec>>, + ) -> Result { + self.model.predict( + peptide_sequence, + mods, + mod_sites, + Some(charge), + Some(nce), + Some(intsrument), + ) } - pub fn fine_tune(&mut self, training_data: &Vec, modifications: HashMap<(String, Option), ModificationMap>, batch_size: usize, learning_rate: f64, epochs: usize) -> Result<()> { - self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs) + pub fn fine_tune( + &mut self, + training_data: &Vec, + modifications: HashMap<(String, Option), ModificationMap>, + batch_size: usize, + learning_rate: f64, + epochs: usize, + ) -> Result<()> { + self.model.fine_tune( + training_data, + modifications, + batch_size, + learning_rate, + epochs, + ) } pub fn set_evaluation_mode(&mut self) { @@ -70,7 +112,12 @@ impl MS2ModelWrapper { } // Public API Function to load a new MS2 model -pub fn load_ms2_model>(model_path: P, constants_path: P, arch: &str, device: Device) -> Result { +pub fn load_ms2_model>( + model_path: P, + constants_path: P, + arch: &str, + device: Device, +) -> Result { MS2ModelWrapper::new(model_path, constants_path, arch, device) } @@ -86,7 +133,7 @@ pub fn load_ms2_model>(model_path: P, constants_path: P, arch: &s // fn peptide_ms2_prediction() { // let model_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth"); // let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/ms2.pth.model_const.yaml"); - + // assert!( // model_path.exists(), // "\n╔══════════════════════════════════════════════════════════════════╗\n\ @@ -101,7 +148,7 @@ pub fn load_ms2_model>(model_path: P, constants_path: P, arch: &s // ╚══════════════════════════════════════════════════════════════════╝\n", // model_path // ); - + // assert!( // constants_path.exists(), // "\n╔══════════════════════════════════════════════════════════════════╗\n\ @@ -118,12 +165,12 @@ pub fn load_ms2_model>(model_path: P, constants_path: P, arch: &s // ); // let result = load_ms2_model(&model_path, &constants_path, "ms2_bert", Device::Cpu); - + // assert!(result.is_ok(), "Failed to load model: {:?}", result.err()); // let mut model = result.unwrap(); // // model.print_summary(); - + // // Print the model's weights // // model.print_weights(); @@ -162,4 +209,4 @@ pub fn load_ms2_model>(model_path: P, constants_path: P, arch: &s // }, // } // } -// } \ No newline at end of file +// } diff --git a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs index a0f3c84..80434da 100644 --- a/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs +++ b/crates/redeem-properties/src/models/rt_cnn_lstm_model.rs @@ -3,20 +3,19 @@ use candle_core::{DType, Device, 
IndexOp, Tensor}; use candle_nn::{Dropout, Module, VarBuilder, VarMap}; use std::collections::HashMap; use std::path::Path; - - +use std::sync::Arc; use crate::building_blocks::building_blocks::{ DecoderLinear, Encoder26aaModCnnLstmAttnSum, MOD_FEATURE_SIZE, }; -use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map}; +use crate::models::model_interface::{ + create_var_map, load_tensors_from_model, ModelInterface, PropertyType, +}; use crate::utils::peptdeep_utils::{ - load_mod_to_feature, - parse_model_constants, ModelConstants, + load_mod_to_feature_arc, parse_model_constants, ModelConstants, }; use crate::utils::utils::get_tensor_stats; - // Main Model Struct #[derive(Clone)] @@ -26,7 +25,7 @@ pub struct RTCNNLSTMModel { varmap: VarMap, constants: ModelConstants, device: Device, - mod_to_feature: HashMap>, + mod_to_feature: HashMap, Vec>, dropout: Dropout, rt_encoder: Encoder26aaModCnnLstmAttnSum, rt_decoder: DecoderLinear, @@ -45,11 +44,10 @@ impl ModelInterface for RTCNNLSTMModel { } fn model_arch(&self) -> &'static str { - "rt_cnn_lstm" + "rt_cnn_lstm" } - fn new_untrained(_device: Device) -> Result - { + fn new_untrained(_device: Device) -> Result { unimplemented!("Untrained model creation is not implemented for this architecture."); } @@ -63,9 +61,8 @@ impl ModelInterface for RTCNNLSTMModel { _mask_modloss: bool, device: Device, ) -> Result { - let tensor_data = load_tensors_from_model(model_path.as_ref(), &device)?; - + let mut varmap = candle_nn::VarMap::new(); create_var_map(&mut varmap, tensor_data, &device)?; let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device); @@ -76,7 +73,7 @@ impl ModelInterface for RTCNNLSTMModel { }; // Load the mod_to_feature mapping - let mod_to_feature = load_mod_to_feature(&constants)?; + let mod_to_feature = load_mod_to_feature_arc(&constants)?; // Encoder let dropout = Dropout::new(0.1); @@ -87,21 +84,33 @@ impl ModelInterface for RTCNNLSTMModel { 128, 2, vec!["rt_encoder.mod_nn.nn.weight"], - vec!["rt_encoder.input_cnn.cnn_short.weight", "rt_encoder.input_cnn.cnn_medium.weight", "rt_encoder.input_cnn.cnn_long.weight"], - vec!["rt_encoder.input_cnn.cnn_short.bias", "rt_encoder.input_cnn.cnn_medium.bias", "rt_encoder.input_cnn.cnn_long.bias"], - "rt_encoder.hidden_nn", vec![ - "rt_encoder.attn_sum.attn.0.weight", + "rt_encoder.input_cnn.cnn_short.weight", + "rt_encoder.input_cnn.cnn_medium.weight", + "rt_encoder.input_cnn.cnn_long.weight", + ], + vec![ + "rt_encoder.input_cnn.cnn_short.bias", + "rt_encoder.input_cnn.cnn_medium.bias", + "rt_encoder.input_cnn.cnn_long.bias", ], - ).unwrap(); + "rt_encoder.hidden_nn", + vec!["rt_encoder.attn_sum.attn.0.weight"], + ) + .unwrap(); let rt_decoder = DecoderLinear::from_varstore( &var_store, 256, 1, - vec!["rt_decoder.nn.0.weight", "rt_decoder.nn.1.weight", "rt_decoder.nn.2.weight"], - vec!["rt_decoder.nn.0.bias", "rt_decoder.nn.2.bias"] - ).unwrap(); + vec![ + "rt_decoder.nn.0.weight", + "rt_decoder.nn.1.weight", + "rt_decoder.nn.2.weight", + ], + vec!["rt_decoder.nn.0.bias", "rt_decoder.nn.2.bias"], + ) + .unwrap(); Ok(Self { var_store, @@ -116,21 +125,20 @@ impl ModelInterface for RTCNNLSTMModel { }) } - fn forward(&self, xs: &Tensor) -> Result { let (_batch_size, _seq_len, _) = xs.shape().dims3()?; - + let aa_indices_out = xs.i((.., .., 0))?; let (mean, min, max) = get_tensor_stats(&aa_indices_out)?; log::debug!("[RTCNNLSTMModel] aa_indices_out stats - min: {min}, max: {max}, mean: {mean}"); let mod_x_out = xs.i((.., .., 
1..1 + MOD_FEATURE_SIZE))?; - + let x = self.rt_encoder.forward(&aa_indices_out, &mod_x_out)?; - + let x = self.dropout.forward(&x, self.is_training)?; - + let x = self.rt_decoder.forward(&x)?; - + let result = x.squeeze(1)?; Ok(result) @@ -165,12 +173,15 @@ impl ModelInterface for RTCNNLSTMModel { self.constants.mod_elements.len() } - fn get_mod_to_feature(&self) -> &HashMap> { + fn get_mod_to_feature(&self) -> &HashMap, Vec> { &self.mod_to_feature } fn get_min_pred_intensity(&self) -> f32 { - unimplemented!("Method not implemented for architecture: {}", self.model_arch()) + unimplemented!( + "Method not implemented for architecture: {}", + self.model_arch() + ) } fn get_mut_varmap(&mut self) -> &mut VarMap { @@ -180,7 +191,10 @@ impl ModelInterface for RTCNNLSTMModel { /// Print a summary of the model's constants. fn print_summary(&self) { println!("RTModel Summary:"); - println!("AA Embedding Size: {}", self.constants.aa_embedding_size.unwrap()); + println!( + "AA Embedding Size: {}", + self.constants.aa_embedding_size.unwrap() + ); println!("Charge Factor: {:?}", self.constants.charge_factor); println!("Instruments: {:?}", self.constants.instruments); println!("Max Instrument Num: {}", self.constants.max_instrument_num); @@ -191,7 +205,7 @@ impl ModelInterface for RTCNNLSTMModel { /// Print the model's weights. fn print_weights(&self) { println!("RTModel Weights:"); - + // Helper function to print the first 5 values of a tensor fn print_first_5_values(tensor: &Tensor, name: &str) { let shape = tensor.shape(); @@ -199,7 +213,11 @@ impl ModelInterface for RTCNNLSTMModel { // Extract the first row if let Ok(row) = tensor.i((0, ..)) { match row.to_vec1::() { - Ok(values) => println!("{} (first 5 values of first row): {:?}", name, &values[..5.min(values.len())]), + Ok(values) => println!( + "{} (first 5 values of first row): {:?}", + name, + &values[..5.min(values.len())] + ), Err(e) => eprintln!("Error printing {}: {:?}", name, e), } } else { @@ -207,13 +225,16 @@ impl ModelInterface for RTCNNLSTMModel { } } else { match tensor.to_vec1::() { - Ok(values) => println!("{} (first 5 values): {:?}", name, &values[..5.min(values.len())]), + Ok(values) => println!( + "{} (first 5 values): {:?}", + name, + &values[..5.min(values.len())] + ), Err(e) => eprintln!("Error printing {}: {:?}", name, e), } } } - - + // Print the first 5 values of each weight tensor if let Ok(tensor) = self.var_store.get((2, 103), "rt_encoder.mod_nn.nn.weight") { print_first_5_values(&tensor, "rt_encoder.mod_nn.nn.weight"); @@ -233,31 +254,58 @@ impl ModelInterface for RTCNNLSTMModel { // if let Ok(tensor) = self.var_store.get((4, 1, 128), "rt_encoder.hidden_nn.rnn_c0") { // print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn_c0"); // } - if let Ok(tensor) = self.var_store.get((512, 140), "rt_encoder.hidden_nn.rnn.weight_ih_l0") { + if let Ok(tensor) = self + .var_store + .get((512, 140), "rt_encoder.hidden_nn.rnn.weight_ih_l0") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_ih_l0"); } - if let Ok(tensor) = self.var_store.get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l0") { + if let Ok(tensor) = self + .var_store + .get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l0") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_hh_l0"); } - if let Ok(tensor) = self.var_store.get((512, 140), "rt_encoder.hidden_nn.rnn.weight_ih_l0_reverse") { + if let Ok(tensor) = self + .var_store + .get((512, 140), "rt_encoder.hidden_nn.rnn.weight_ih_l0_reverse") + { print_first_5_values(&tensor, 
"rt_encoder.hidden_nn.rnn.weight_ih_l0_reverse"); } - if let Ok(tensor) = self.var_store.get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l0_reverse") { + if let Ok(tensor) = self + .var_store + .get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l0_reverse") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_hh_l0_reverse"); } - if let Ok(tensor) = self.var_store.get((512, 256), "rt_encoder.hidden_nn.rnn.weight_ih_l1") { + if let Ok(tensor) = self + .var_store + .get((512, 256), "rt_encoder.hidden_nn.rnn.weight_ih_l1") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_ih_l1"); } - if let Ok(tensor) = self.var_store.get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l1") { + if let Ok(tensor) = self + .var_store + .get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l1") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_hh_l1"); } - if let Ok(tensor) = self.var_store.get((512, 256), "rt_encoder.hidden_nn.rnn.weight_ih_l1_reverse") { + if let Ok(tensor) = self + .var_store + .get((512, 256), "rt_encoder.hidden_nn.rnn.weight_ih_l1_reverse") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_ih_l1_reverse"); } - if let Ok(tensor) = self.var_store.get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l1_reverse") { + if let Ok(tensor) = self + .var_store + .get((512, 128), "rt_encoder.hidden_nn.rnn.weight_hh_l1_reverse") + { print_first_5_values(&tensor, "rt_encoder.hidden_nn.rnn.weight_hh_l1_reverse"); } - if let Ok(tensor) = self.var_store.get((1, 256), "rt_encoder.attn_sum.attn.0.weight") { + if let Ok(tensor) = self + .var_store + .get((1, 256), "rt_encoder.attn_sum.attn.0.weight") + { print_first_5_values(&tensor, "rt_encoder.attn_sum.attn.0.weight"); } if let Ok(tensor) = self.var_store.get((256, 256), "rt_decoder.nn.0.weight") { @@ -270,8 +318,6 @@ impl ModelInterface for RTCNNLSTMModel { print_first_5_values(&tensor, "rt_decoder.nn.2.weight"); } } - - } // Module Trait Implementation @@ -282,7 +328,6 @@ impl ModelInterface for RTCNNLSTMModel { // } // } - #[cfg(test)] mod tests { use crate::models::model_interface::{ModelInterface, PredictionResult}; @@ -293,7 +338,7 @@ mod tests { use super::*; #[test] - fn test_tensor_from_pth(){ + fn test_tensor_from_pth() { let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); let tensor_data = candle_core::pickle::read_all(model_path).unwrap(); println!("{:?}", tensor_data); @@ -319,152 +364,150 @@ mod tests { let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); - - let peptide_sequences = "AGHCEWQMKYR"; - let mods = "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M"; - let mod_sites = "0;4;8"; - // let charge = Some(2); - // let nce = Some(20); - // let instrument = Some("QE"); + let model = + RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); + + let seq = Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()); + let mods = Arc::from( + b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .to_vec() + .into_boxed_slice(), + ); + let mod_sites = Arc::from(b"0;4;8".to_vec().into_boxed_slice()); + let charge = Some(2); + let nce = Some(20); + let instrument = Some(Arc::from(b"QE".to_vec().into_boxed_slice())); let result = - model.encode_peptide(&peptide_sequences, mods, mod_sites, None, None, None); + model.encode_peptide(&seq, &mods, &mod_sites, 
charge, nce, instrument.as_ref()); println!("{:?}", result); - - // assert!(result.is_ok()); - // let encoded_peptides = result.unwrap(); - // assert_eq!(encoded_peptides.shape().dims2().unwrap(), (1, 27 + 109 + 1 + 1 + 1)); + assert!(result.is_ok()); } #[test] fn test_encode_peptides_batch() { - let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); - let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); + let constants_path = + PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::Cpu; - let model = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device.clone()).unwrap(); - - // Batched input - let peptide_sequences = vec![ - "ACDEFGHIK", - "AGHCEWQMKYR", + let model = RTCNNLSTMModel::new( + &model_path, + Some(&constants_path), + 0, + 8, + 4, + true, + device.clone(), + ) + .unwrap(); + + let naked_sequence = vec![ + Arc::from(b"ACDEFGHIK".to_vec().into_boxed_slice()), + Arc::from(b"AGHCEWQMKYR".to_vec().into_boxed_slice()), ]; let mods = vec![ - "Carbamidomethyl@C", - "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", + Arc::from(b"Carbamidomethyl@C".to_vec().into_boxed_slice()), + Arc::from( + b"Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M" + .to_vec() + .into_boxed_slice(), + ), ]; let mod_sites = vec![ - "1", - "0;4;8", + Arc::from(b"1".to_vec().into_boxed_slice()), + Arc::from(b"0;4;8".to_vec().into_boxed_slice()), ]; - println!("Peptides: {:?}", peptide_sequences); - println!("Mods: {:?}", mods); - println!("Mod sites: {:?}", mod_sites); - - - let result = model.encode_peptides( - &peptide_sequences, - &mods, - &mod_sites, - None, - None, - None, - ); + let result = model.encode_peptides(&naked_sequence, &mods, &mod_sites, None, None, None); assert!(result.is_ok()); let tensor = result.unwrap(); println!("Batched encoded tensor shape: {:?}", tensor.shape()); let (batch, seq_len, feat_dim) = tensor.shape().dims3().unwrap(); - assert_eq!(batch, 2); // two peptides - assert!(seq_len >= 11); // padded to max length - assert!(feat_dim > 1); // includes aa + mod features + assert_eq!(batch, 2); + assert!(seq_len >= 11); + assert!(feat_dim > 1); } - #[test] fn test_prediction() { - let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth"); - let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); + let constants_path = + PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml"); let device = Device::new_cuda(0).unwrap_or(Device::Cpu); - let result = RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device); - let mut model = result.unwrap(); + let mut model = + RTCNNLSTMModel::new(&model_path, Some(&constants_path), 0, 8, 4, true, device).unwrap(); let test_peptides = vec![ - ("AGHCEWQMKYR", "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", "0;4;8", 0.2945), + ( + "AGHCEWQMKYR", + "Acetyl@Protein N-term;Carbamidomethyl@C;Oxidation@M", + "0;4;8", + 0.2945, + ), ("QPYAVSELAGHQTSAESWGTGR", "", "", 0.4328955), ("GMSVSDLADKLSTDDLNSLIAHAHR", "Oxidation@M", "1", 0.6536107), - ("TVQHHVLFTDNMVLICR", "Oxidation@M;Carbamidomethyl@C", "11;15", 0.7811949), + ( + "TVQHHVLFTDNMVLICR", + "Oxidation@M;Carbamidomethyl@C", + "11;15", + 0.7811949, + ), ("EAELDVNEELDKK", "", "", 0.2934583), ("YTPVQQGPVGVNVTYGGDPIPK", "", "", 0.5863009), ("YYAIDFTLDEIK", "", "", 0.8048359), ("VSSLQAEPLPR", "", "", 0.3201348), - ("NHAVVCQGCHNAIDPEVQR", 
"Carbamidomethyl@C;Carbamidomethyl@C", "5;8", 0.1730425), + ( + "NHAVVCQGCHNAIDPEVQR", + "Carbamidomethyl@C;Carbamidomethyl@C", + "5;8", + 0.1730425, + ), ("IPNIYAIGDVVAGPMLAHK", "", "", 0.8220097), - ("AELGIPLEEVPPEEINYLTR", "", "", 0.8956433), - ("NESTPPSEELELDKWK", "", "", 0.4471560), - ("SIQEIQELDKDDESLR", "", "", 0.4157068), - ("EMEENFAVEAANYQDTIGR", "Oxidation@M", "1", 0.6388353), - ("MDSFDEDLARPSGLLAQER", "Oxidation@M", "0", 0.5593624), - ("SLLTEADAGHTEFTDEVYQNESR", "", "", 0.5538696), - ("NQDLAPNSAEQASILSLVTK", "", "", 0.7682227), - ("GKVEEVELPVEK", "", "", 0.2943246), - ("IYVASVHQDLSDDDIK", "", "", 0.3847130), - ("IKGDMDISVPK", "", "", 0.2844255), - ("IIPVLLEHGLER", "", "", 0.5619017), - ("AGYTDKVVIGMDVAASEFFR", "", "", 0.8972052), - ("TDYNASVSVPDSSGPER", "", "", 0.3279318), - ("DLKPQNLLINTEGAIK", "", "", 0.6046495), - ("VAEAIAASFGSFADFK", "", "", 0.8935943), - ("AMVSNAQLDNEK", "Oxidation@M", "1", 0.1724159), - ("THINIVVIGHVDSGK", "", "", 0.4865058), - ("LILPHVDIQLK", "", "", 0.6268850), - ("LIAPVAEEEATVPNNK", "", "", 0.4162872), - ("FTASAGIQVVGDDLTVTNPK", "", "", 0.7251064), - ("HEDLKDMLEFPAQELR", "", "", 0.6529368), - ("LLPDFLLER", "", "", 0.7852863), ]; - let peptides: Vec<&str> = test_peptides.iter().map(|(pep, _, _, _)| *pep).collect(); - let mods: Vec<&str> = test_peptides.iter().map(|(_, mod_, _, _)| *mod_).collect(); - let mod_sites: Vec<&str> = test_peptides.iter().map(|(_, _, sites, _)| *sites).collect(); + let peptides: Vec> = test_peptides + .iter() + .map(|(pep, _, _, _)| Arc::from(pep.as_bytes().to_vec().into_boxed_slice())) + .collect(); + let mods: Vec> = test_peptides + .iter() + .map(|(_, mod_, _, _)| Arc::from(mod_.as_bytes().to_vec().into_boxed_slice())) + .collect(); + let mod_sites: Vec> = test_peptides + .iter() + .map(|(_, _, sites, _)| Arc::from(sites.as_bytes().to_vec().into_boxed_slice())) + .collect(); let observed_rts: Vec = test_peptides.iter().map(|(_, _, _, rt)| *rt).collect(); match model.predict(&peptides, &mods, &mod_sites, None, None, None) { - Ok(predictions) => { - if let PredictionResult::RTResult(rt_preds) = predictions { - let total_error: f32 = rt_preds.iter().zip(observed_rts.iter()) - .map(|(pred, obs)| (pred - obs).abs()) - .sum(); - - let mut peptides_iter = peptides.iter(); - let mut rt_preds_iter = rt_preds.iter(); - let mut observed_rts_iter = observed_rts.iter(); - - loop { - match (peptides_iter.next(), rt_preds_iter.next(), observed_rts_iter.next()) { - (Some(pep), Some(pred), Some(obs)) => { - println!("Peptide: {}, Predicted RT: {}, Observed RT: {}", pep, pred, obs); - } - _ => break, - } - } - - let mean_absolute_error = total_error / rt_preds.len() as f32; - println!("Mean Absolute Error: {:.6}", mean_absolute_error); - } else { - println!("Unexpected prediction result type."); + Ok(PredictionResult::RTResult(rt_preds)) => { + let total_error: f32 = rt_preds + .iter() + .zip(observed_rts.iter()) + .map(|(pred, obs)| (pred - obs).abs()) + .sum(); + + for ((pep_bytes, pred), obs) in peptides + .iter() + .zip(rt_preds.iter()) + .zip(observed_rts.iter()) + { + let pep = std::str::from_utf8(pep_bytes).unwrap_or(""); + println!( + "Peptide: {}, Predicted RT: {}, Observed RT: {}", + pep, pred, obs + ); } + + let mean_absolute_error = total_error / rt_preds.len() as f32; + println!("Mean Absolute Error: {:.6}", mean_absolute_error); } - Err(e) => { - println!("Error during batch prediction: {:?}", e); - } + Ok(_) => println!("Unexpected prediction result type."), + Err(e) => println!("Error during batch prediction: {:?}", e), } } - } 
diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
index 0231fe7..3743261 100644
--- a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
@@ -3,13 +3,14 @@ use candle_core::{DType, Device, IndexOp, Tensor};
 use candle_nn::{Dropout, Module, VarBuilder, VarMap};
 use std::collections::HashMap;
 use std::path::Path;
+use std::sync::Arc;

 use crate::building_blocks::building_blocks::{
     DecoderLinear, Encoder26aaModCnnTransformerAttnSum, MOD_FEATURE_SIZE,
 };
 use crate::models::model_interface::{ModelInterface, PropertyType, load_tensors_from_model, create_var_map};
 use crate::utils::peptdeep_utils::{
-    load_mod_to_feature,
+    load_mod_to_feature_arc,
     parse_model_constants,
     ModelConstants,
 };
 use crate::utils::utils::get_tensor_stats;
@@ -24,7 +25,7 @@ pub struct RTCNNTFModel {
     varmap: VarMap,
     constants: ModelConstants,
     device: Device,
-    mod_to_feature: HashMap<String, Vec<f32>>,
+    mod_to_feature: HashMap<Arc<[u8]>, Vec<f32>>,
     dropout: Dropout,
     rt_encoder: Encoder26aaModCnnTransformerAttnSum,
     rt_decoder: DecoderLinear,
@@ -66,7 +67,7 @@ impl ModelInterface for RTCNNTFModel {
         log::trace!("[RTCNNTFModel] Initializing rt_decoder");
         let rt_decoder = DecoderLinear::new(128, 1, &varbuilder.pp("rt_decoder"))?;
         let constants = ModelConstants::default();
-        let mod_to_feature = load_mod_to_feature(&constants)?;
+        let mod_to_feature = load_mod_to_feature_arc(&constants)?;

         Ok(Self {
             var_store: varbuilder,
@@ -101,7 +102,7 @@ impl ModelInterface for RTCNNTFModel {
             None => ModelConstants::default(),
         };

-        let mod_to_feature = load_mod_to_feature(&constants)?;
+        let mod_to_feature = load_mod_to_feature_arc(&constants)?;

         let dropout = Dropout::new(0.1);

         let rt_encoder = Encoder26aaModCnnTransformerAttnSum::from_varstore(
@@ -195,7 +196,7 @@ impl ModelInterface for RTCNNTFModel {
         self.constants.mod_elements.len()
     }

-    fn get_mod_to_feature(&self) -> &HashMap<String, Vec<f32>> {
+    fn get_mod_to_feature(&self) -> &HashMap<Arc<[u8]>, Vec<f32>> {
         &self.mod_to_feature
     }

diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs
index c41f056..0044252 100644
--- a/crates/redeem-properties/src/models/rt_model.rs
+++ b/crates/redeem-properties/src/models/rt_model.rs
@@ -1,22 +1,23 @@
 // rt_model.rs

-use std::path::Path;
-use std::ops::Deref;
-use candle_core::{Device, Tensor};
-use anyhow::{Result, anyhow};
-use candle_nn::VarMap;
-use crate::models::model_interface::{ModelInterface,PredictionResult};
+use crate::models::model_interface::{ModelInterface, PredictionResult};
 use crate::models::rt_cnn_lstm_model::RTCNNLSTMModel;
 use crate::models::rt_cnn_transformer_model::RTCNNTFModel;
 use crate::utils::data_handling::{PeptideData, RTNormalization};
+use crate::utils::peptdeep_utils::ModificationMap;
 use crate::utils::stats::TrainingStepMetrics;
+use anyhow::{anyhow, Result};
+use candle_core::{Device, Tensor};
+use candle_nn::VarMap;
 use std::collections::HashMap;
-use crate::utils::peptdeep_utils::ModificationMap;
+use std::ops::Deref;
+use std::path::Path;
+use std::sync::Arc;

 // Enum for different types of retention time models
 pub enum RTModelArch {
     RTCNNLSTM,
-    RTCNNTF
+    RTCNNTF,
 }

 // Constants for different types of retention time models
@@ -35,32 +36,97 @@ impl Clone for RTModelWrapper {
     }
 }

-
 impl RTModelWrapper {
-    pub fn new<P: AsRef<Path>>(model_path: P, constants_path: Option<P>, arch: &str, device: Device) -> Result<Self> {
+    pub fn new<P: AsRef<Path>>(
+        model_path: P,
+        constants_path: Option<P>,
+        arch: &str,
+        device: Device,
+    ) -> Result<Self> {
         let model: Box<dyn ModelInterface> = match arch {
-            "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(model_path, constants_path, 0, 8, 4, true, device)?),
-            "rt_cnn_tf" => Box::new(RTCNNTFModel::new(model_path, constants_path, 0, 8, 4, true, device)?),
+            "rt_cnn_lstm" => Box::new(RTCNNLSTMModel::new(
+                model_path,
+                constants_path,
+                0,
+                8,
+                4,
+                true,
+                device,
+            )?),
+            "rt_cnn_tf" => Box::new(RTCNNTFModel::new(
+                model_path,
+                constants_path,
+                0,
+                8,
+                4,
+                true,
+                device,
+            )?),
             _ => return Err(anyhow!("Unsupported RT model architecture: {}", arch)),
         };
         Ok(Self { model })
     }

-    pub fn predict(&self, peptide_sequence: &Vec<&str>, mods: &Vec<&str>, mod_sites: &Vec<&str>) -> Result<PredictionResult> {
-        self.model.predict(peptide_sequence, mods, mod_sites, None, None, None)
+    pub fn predict(
+        &self,
+        peptide_sequence: &[Arc<[u8]>],
+        mods: &[Arc<[u8]>],
+        mod_sites: &[Arc<[u8]>],
+    ) -> Result<PredictionResult> {
+        self.model
+            .predict(peptide_sequence, mods, mod_sites, None, None, None)
     }

-    pub fn train(&mut self, training_data: &Vec<PeptideData>, val_data: Option<&Vec<PeptideData>>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size: usize, val_batch_size: usize, learning_rate: f64, epochs: usize, early_stopping_patience: usize) -> Result<TrainingStepMetrics> {
-        self.model.train(training_data, val_data, modifications, batch_size, val_batch_size, learning_rate, epochs, early_stopping_patience)
+    pub fn train(
+        &mut self,
+        training_data: &Vec<PeptideData>,
+        val_data: Option<&Vec<PeptideData>>,
+        modifications: HashMap<(String, Option<char>), ModificationMap>,
+        batch_size: usize,
+        val_batch_size: usize,
+        learning_rate: f64,
+        epochs: usize,
+        early_stopping_patience: usize,
+    ) -> Result<TrainingStepMetrics> {
+        self.model.train(
+            training_data,
+            val_data,
+            modifications,
+            batch_size,
+            val_batch_size,
+            learning_rate,
+            epochs,
+            early_stopping_patience,
+        )
     }

-    pub fn fine_tune(&mut self, training_data: &Vec<PeptideData>, modifications: HashMap<(String, Option<char>), ModificationMap>, batch_size:usize, learning_rate: f64, epochs: usize) -> Result<()> {
-        self.model.fine_tune(training_data, modifications, batch_size, learning_rate, epochs)
+    pub fn fine_tune(
+        &mut self,
+        training_data: &Vec<PeptideData>,
+        modifications: HashMap<(String, Option<char>), ModificationMap>,
+        batch_size: usize,
+        learning_rate: f64,
+        epochs: usize,
+    ) -> Result<()> {
+        self.model.fine_tune(
+            training_data,
+            modifications,
+            batch_size,
+            learning_rate,
+            epochs,
+        )
     }

-    pub fn inference(&mut self, inference_data: &Vec<PeptideData>, batch_size: usize, modifications: HashMap<(String, Option<char>), ModificationMap>, rt_norm_params: RTNormalization,) -> Result<Vec<PeptideData>> {
-        self.model.inference(inference_data, batch_size, modifications, rt_norm_params)
+    pub fn inference(
+        &mut self,
+        inference_data: &Vec<PeptideData>,
+        batch_size: usize,
+        modifications: HashMap<(String, Option<char>), ModificationMap>,
+        rt_norm_params: RTNormalization,
+    ) -> Result<Vec<PeptideData>> {
+        self.model
+            .inference(inference_data, batch_size, modifications, rt_norm_params)
     }

     pub fn set_evaluation_mode(&mut self) {
@@ -85,7 +151,12 @@ impl RTModelWrapper {
 }

 // Public API Function to load a new RT model
-pub fn load_retention_time_model<P: AsRef<Path>>(model_path: P, constants_path: Option<P>, arch: &str, device: Device) -> Result<RTModelWrapper> {
+pub fn load_retention_time_model<P: AsRef<Path>>(
+    model_path: P,
+    constants_path: Option<P>,
+    arch: &str,
+    device: Device,
+) -> Result<RTModelWrapper> {
     RTModelWrapper::new(model_path, constants_path, arch, device)
 }

@@ -101,7 +172,7 @@ pub fn load_retention_time_model<P: AsRef<Path>>(model_path: P, constants_path:
 //     let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth");
 //     // let model_path = PathBuf::from("data/models/alphapeptdeep/generic/rt_resaved_model.pth");
 //     let constants_path = PathBuf::from("data/models/alphapeptdeep/generic/rt.pth.model_const.yaml");
-    
+
 //     assert!(
 //         model_path.exists(),
 //         "\n╔══════════════════════════════════════════════════════════════════╗\n\
@@ -116,7 +187,7 @@ pub fn load_retention_time_model<P: AsRef<Path>>(model_path: P, constants_path:
 //         ╚══════════════════════════════════════════════════════════════════╝\n",
 //         model_path
 //     );
-    
+
 //     assert!(
 //         constants_path.exists(),
 //         "\n╔══════════════════════════════════════════════════════════════════╗\n\
@@ -133,12 +204,12 @@ pub fn load_retention_time_model<P: AsRef<Path>>(model_path: P, constants_path:
 //         ╚══════════════════════════════════════════════════════════════════╝\n",
 //         constants_path
 //     );

 //     let result = load_retention_time_model(&model_path, &constants_path, "rt_cnn_lstm", Device::Cpu);
-    
+
 //     assert!(result.is_ok(), "Failed to load model: {:?}", result.err());

 //     let mut model = result.unwrap();

 //     model.print_summary();
-    
+
 //     // Print the model's weights
 //     model.print_weights();
@@ -171,4 +242,4 @@ pub fn load_retention_time_model<P: AsRef<Path>>(model_path: P, constants_path:
 //         },
 //     }
 // }
-// }
\ No newline at end of file
+// }
diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs
index dfdfcc5..6a4757e 100644
--- a/crates/redeem-properties/src/utils/peptdeep_utils.rs
+++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs
@@ -4,6 +4,7 @@ use std::ops::Index;
 use std::path::PathBuf;
 use std::io;
 use std::fs;
+use std::sync::Arc;
 use log::info;
 use csv::ReaderBuilder;
 use reqwest;
@@ -214,6 +215,33 @@ pub fn load_mod_to_feature(constants: &ModelConstants) -> Result<HashMap<String, Vec<f32>>, Error>
+pub fn load_mod_to_feature_arc(constants: &ModelConstants) -> Result<HashMap<Arc<[u8]>, Vec<f32>>, Error> {
+    let path = ensure_mod_tsv_exists()?;
+    let mut rdr = ReaderBuilder::new()
+        .delimiter(b'\t')
+        .from_path(path)?;
+
+    let mod_elem_to_idx: HashMap<String, usize> = constants
+        .mod_elements
+        .iter()
+        .enumerate()
+        .map(|(i, elem)| (elem.clone(), i))
+        .collect();
+
+    let mod_feature_size = constants.mod_elements.len();
+    let mut mod_to_feature = HashMap::new();
+
+    for result in rdr.deserialize() {
+        let record: ModFeature = result?;
+        let feature_vector = parse_mod_formula(&record.composition, &mod_elem_to_idx, mod_feature_size);
+        mod_to_feature.insert(Arc::from(record.mod_name.as_bytes()), feature_vector);
+    }
+
+    Ok(mod_to_feature)
+}
+
 #[derive(Debug, Clone)]
 pub struct ModificationMap {
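Note: one reason the `Arc<[u8]>` keys above stay ergonomic is that `Arc<T>` implements `Borrow<T>`, so a `HashMap<Arc<[u8]>, _>` can be queried with a plain `&[u8]` and no allocation. A small illustrative sketch, independent of the crate:

    use std::collections::HashMap;
    use std::sync::Arc;

    fn main() {
        let mut mod_to_feature: HashMap<Arc<[u8]>, Vec<f32>> = HashMap::new();
        mod_to_feature.insert(Arc::from(&b"Oxidation@M"[..]), vec![0.0, 1.0]);

        // Arc<[u8]>: Borrow<[u8]> lets get() accept any &[u8] key directly.
        let key: &[u8] = b"Oxidation@M";
        assert_eq!(mod_to_feature.get(key), Some(&vec![0.0, 1.0]));
    }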
From 905c80ac8eb81ef97600700854103f6c99114092 Mon Sep 17 00:00:00 2001
From: singjc
Date: Tue, 13 May 2025 13:04:55 -0400
Subject: [PATCH 47/75] refactor: Improve error handling in redeem-cli crate

---
 crates/redeem-cli/src/main.rs                 | 29 +++++++++++++++----
 .../src/properties/train/trainer.rs           |  2 +-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/crates/redeem-cli/src/main.rs b/crates/redeem-cli/src/main.rs
index d9ea4dd..6ce3d47 100644
--- a/crates/redeem-cli/src/main.rs
+++ b/crates/redeem-cli/src/main.rs
@@ -164,21 +164,38 @@ fn handle_properties(matches: &ArgMatches) -> Result<()> {
         Some(("train", train_matches)) => {
             let config_path: &PathBuf = train_matches.get_one("config").unwrap();
             log::info!("[ReDeeM::Properties] Training from config: {:?}", config_path);
-            let params: PropertyTrainConfig = PropertyTrainConfig::from_arguments(config_path, train_matches)?;
-            let _ = trainer::run_training(&params);
-            Ok(())
+
+            let params: PropertyTrainConfig =
+                PropertyTrainConfig::from_arguments(config_path, train_matches)?;
+
+            match trainer::run_training(&params) {
+                Ok(_) => Ok(()),
+                Err(e) => {
+                    log::error!("Training failed: {:#}", e);
+                    std::process::exit(1)
+                }
+            }
         },
         Some(("inference", inference_matches)) => {
             let config_path: &PathBuf = inference_matches.get_one("config").unwrap();
             log::info!("[ReDeeM::Properties] Inference using config: {:?}", config_path);
-            let params: PropertyInferenceConfig = PropertyInferenceConfig::from_arguments(config_path, inference_matches)?;
-            let _ = inference::run_inference(&params);
-            Ok(())
+
+            let params: PropertyInferenceConfig =
+                PropertyInferenceConfig::from_arguments(config_path, inference_matches)?;
+
+            match inference::run_inference(&params) {
+                Ok(_) => Ok(()),
+                Err(e) => {
+                    log::error!("Inference failed: {:#}", e);
+                    std::process::exit(1)
+                }
+            }
         }
         _ => unreachable!(),
     }
 }

+
 fn handle_classifiers(matches: &ArgMatches) -> Result<()> {
     match matches.subcommand() {
         Some(("rescore", rescore_matches)) => {
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index 2c6c657..bbbec1c 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -144,7 +144,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         config.learning_rate as f64,
         config.epochs,
         config.early_stopping_patience,
-    )?;
+    ).with_context(|| "Training failed: an error occurred during the model training process")?;
     log::info!("Training completed in {:?}", start_time.elapsed());
     model.save(&config.output_file)?;
     log::info!("Model saved to: {}", config.output_file);

From 9e6d8c3ea1ada8180ce99290ba67d849d9ac7d6a Mon Sep 17 00:00:00 2001
From: singjc
Date: Tue, 13 May 2025 21:06:30 -0400
Subject: [PATCH 48/75] refactor: Optimize contiguous operations in
 building_blocks.rs

---
 .../src/building_blocks/building_blocks.rs    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs
index 132d067..2e758fd 100644
--- a/crates/redeem-properties/src/building_blocks/building_blocks.rs
+++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs
@@ -1056,11 +1056,11 @@ impl Encoder26aaModCnnTransformerAttnSum {
         }

         let x = self.input_cnn.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.proj_cnn_to_transformer.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.input_transformer.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.attn_sum.forward(&x)?;

         Ok(x)
@@ -1177,11 +1177,11 @@ impl Encoder26aaModChargeCnnTransformerAttnSum {
         log::trace!("[Encoder26aaModChargeCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}");

         let x = self.input_cnn.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.proj_cnn_to_transformer.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.input_transformer.forward(&x)?;
-
+        let x = x.contiguous()?;
         let x = self.attn_sum.forward(&x)?;

         Ok(x)
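Note: `.contiguous()?` is the standard candle idiom for materializing a tensor after a layout-changing operation (for example `transpose` or `permute`), because some kernels require standard row-major strides. An illustrative standalone snippet against the `candle_core` API, not taken from this repository:

    use candle_core::{Device, Result, Tensor};

    fn main() -> Result<()> {
        let x = Tensor::zeros((2, 8, 16), candle_core::DType::F32, &Device::Cpu)?;
        // transpose returns a strided view; contiguous() copies it into a
        // standard layout so downstream layers can consume it.
        let x = x.transpose(1, 2)?;
        let x = x.contiguous()?;
        assert_eq!(x.dims(), &[2, 16, 8]);
        Ok(())
    }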
From a55ae3feabb83c8ff59b1cc3f296ef15c03d6a9f Mon Sep 17 00:00:00 2001
From: singjc
Date: Tue, 13 May 2025 23:56:15 -0400
Subject: [PATCH 49/75] refactor: Update rank feature based on new classifier
 scores

---
 .../redeem-classifiers/src/data_handling.rs   | 102 ++++++++++++++--
 crates/redeem-classifiers/src/psm_scorer.rs   |  13 ++-
 2 files changed, 104 insertions(+), 11 deletions(-)

diff --git a/crates/redeem-classifiers/src/data_handling.rs b/crates/redeem-classifiers/src/data_handling.rs
index 25d9067..aea785e 100644
--- a/crates/redeem-classifiers/src/data_handling.rs
+++ b/crates/redeem-classifiers/src/data_handling.rs
@@ -1,3 +1,6 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
 use ndarray::{Array1, Array2, ArrayView2, Axis};
 use rand::rngs::StdRng;
 use rand::seq::SliceRandom;
@@ -5,6 +8,16 @@ use rand::{thread_rng, SeedableRng};

 use crate::stats::tdc;

+#[derive(Debug, Clone)]
+pub struct PsmMetadata {
+    /// Spectrum id
+    pub spec_id: Vec<String>,
+    /// File identifier
+    pub file_id: Vec<usize>,
+    /// Feature names
+    pub feature_names: Vec<String>,
+}
+
 #[derive(Debug, Clone)]
 pub struct Experiment {
     pub x: Array2<f32>,
@@ -13,10 +26,11 @@ pub struct Experiment {
     pub is_top_peak: Array1<bool>,
     pub tg_num_id: Array1<i32>,
     pub classifier_score: Array1<f32>,
+    pub psm_metadata: PsmMetadata,
 }

 impl Experiment {
-    pub fn new(x: Array2<f32>, y: Array1<i32>) -> Self {
+    pub fn new(x: Array2<f32>, y: Array1<i32>, psm_metadata: PsmMetadata) -> Self {
         let n_samples = x.nrows();
         Experiment {
             x,
@@ -25,6 +39,7 @@ impl Experiment {
             is_top_peak: Array1::from_elem(n_samples, false),
             tg_num_id: Array1::from_elem(n_samples, 0),
             classifier_score: Array1::from_elem(n_samples, 0.0),
+            psm_metadata,
         }
     }

@@ -71,6 +86,45 @@ impl Experiment {
         new_labels
     }

+    /// Update the "rank" feature column based on new classifier scores.
+    ///
+    /// This re-ranks all PSMs per spectrum (grouped by file_id and spec_id),
+    /// and sets the rank column in `self.x` accordingly (1 = best).
+    ///
+    /// # Arguments
+    /// * `scores` - The current classifier scores (same length as rows in `x`)
+    /// * `metadata` - PSM metadata with file_id and spec_id for grouping
+    pub fn update_rank_feature(&mut self, scores: &Array1<f32>, metadata: &PsmMetadata) {
+        // 1. Locate the "rank" feature index
+        let Some(rank_feature_idx) = metadata
+            .feature_names
+            .iter()
+            .position(|name| name == "rank")
+        else {
+            log::warn!("No 'rank' feature found in feature_names — skipping rank update.");
+            return;
+        };
+
+        // 2. Group PSMs by (file_id, spec_id)
+        let mut spectrum_groups: HashMap<(usize, &str), Vec<(usize, f32)>> = HashMap::new();
+        for i in 0..self.x.nrows() {
+            spectrum_groups
+                .entry((metadata.file_id[i], metadata.spec_id[i].as_str()))
+                .or_default()
+                .push((i, scores[i]));
+        }
+
+        // 3. For each group, sort by score descending and assign new rank
+        for group in spectrum_groups.values_mut() {
+            group.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+            for (rank, (row_idx, _)) in group.iter().enumerate() {
+                self.x[[*row_idx, rank_feature_idx]] = (rank + 1) as f32;
+            }
+        }
+
+        log::debug!("Updated rank feature for {} spectrum groups.", spectrum_groups.len());
+    }
+
     pub fn get_top_test_peaks(&self) -> Experiment {
         let mask = &self.is_train.mapv(|x| !x) & &self.is_top_peak;
         self.filter(&mask)
@@ -103,16 +157,50 @@ impl Experiment {
         self.x.clone()
     }

+    /// Filter the experiment by applying a boolean mask to all row-aligned fields.
+    ///
+    /// This includes:
+    /// - Feature matrix `x`
+    /// - Labels `y`
+    /// - Training/test flags `is_train`
+    /// - Top peak flags `is_top_peak`
+    /// - Target group identifiers `tg_num_id`
+    /// - Classifier scores `classifier_score`
+    /// - PSM metadata: `spec_id`, `file_id` (feature names are retained as-is)
+    ///
+    /// # Arguments
+    ///
+    /// * `mask` - A boolean mask (`Array1<bool>`) of the same length as the number of PSMs (rows in `x`)
+    ///
+    /// # Returns
+    ///
+    /// A new `Experiment` instance with only rows where `mask[i] == true`
     pub fn filter(&self, mask: &Array1<bool>) -> Experiment {
+        let selected_indices: Vec<usize> = mask
+            .iter()
+            .enumerate()
+            .filter_map(|(i, &m)| if m { Some(i) } else { None })
+            .collect();
+
+        fn filter_vec<T: Clone>(v: &Vec<T>, indices: &[usize]) -> Vec<T> {
+            indices.iter().map(|&i| v[i].clone()).collect()
+        }
+
         Experiment {
-            x: self.x.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
-            y: self.y.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
-            is_train: self.is_train.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
-            is_top_peak: self.is_top_peak.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
-            tg_num_id: self.tg_num_id.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
-            classifier_score: self.classifier_score.select(Axis(0), &mask.iter().enumerate().filter_map(|(i, &m)| if m { Some(i) } else { None }).collect::<Vec<_>>()),
+            x: self.x.select(Axis(0), &selected_indices),
+            y: self.y.select(Axis(0), &selected_indices),
+            is_train: self.is_train.select(Axis(0), &selected_indices),
+            is_top_peak: self.is_top_peak.select(Axis(0), &selected_indices),
+            tg_num_id: self.tg_num_id.select(Axis(0), &selected_indices),
+            classifier_score: self.classifier_score.select(Axis(0), &selected_indices),
+            psm_metadata: PsmMetadata {
+                spec_id: filter_vec(&self.psm_metadata.spec_id, &selected_indices),
+                file_id: filter_vec(&self.psm_metadata.file_id, &selected_indices),
+                feature_names: self.psm_metadata.feature_names.clone(), // not row-aligned
+            },
         }
     }

+
     pub fn split_for_xval(&mut self, fraction: f32, is_test: bool) {
         let mut rng = thread_rng();

diff --git a/crates/redeem-classifiers/src/psm_scorer.rs b/crates/redeem-classifiers/src/psm_scorer.rs
index 82475cb..f4bd9ba 100644
--- a/crates/redeem-classifiers/src/psm_scorer.rs
+++ b/crates/redeem-classifiers/src/psm_scorer.rs
@@ -5,7 +5,7 @@ use rand::seq::SliceRandom;
 use rand::thread_rng;
 use serde::{Deserialize, Serialize};

-use crate::data_handling::Experiment;
+use crate::data_handling::{Experiment, PsmMetadata};
 use crate::models::utils::{ModelParams, ModelType};

 #[cfg(feature = "xgboost")]
@@ -321,9 +321,9 @@ impl SemiSupervisedLearner {
     /// # Returns
     ///
     /// The predictions for the input features
-    pub fn fit(&mut self, x: Array2<f32>, y: Array1<i32>) -> Array1<f32> {
+    pub fn fit(&mut self, x: Array2<f32>, y: Array1<i32>, psm_metadata: PsmMetadata) -> Array1<f32> {

-        let mut experiment = Experiment::new(x.clone(), y.clone());
+        let mut experiment = Experiment::new(x.clone(), y.clone(), psm_metadata.clone());

         experiment.log_input_data_summary();

@@ -373,11 +373,14 @@ impl SemiSupervisedLearner {

             new_labels = experiment.update_labels(&all_predictions, self.train_fdr, best_desc);
             experiment.y = new_labels;
+
+            experiment.update_rank_feature(&all_predictions, &experiment.psm_metadata.clone());
+
         }

         // Final prediction on the entire dataset
         log::info!("Final prediction on the entire dataset");
-        let experiment = Experiment::new(x, y);
+        let experiment = Experiment::new(x, y, psm_metadata);

         // self.model
         //     .fit(&experiment.x, &experiment.y.to_vec(), None, None);
@@ -453,6 +456,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "xgboost")]
     fn test_xgb_semi_supervised_learner() {
         // Load the test data from the TSV files
         let x = read_features_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_for_testing.csv").unwrap();
@@ -485,6 +489,7 @@ mod tests {
     }

     #[test]
+    #[cfg(feature = "linfa")]
     fn test_svm_semi_supervised_learner() {
         // Load the test data from the TSV files
         let x = read_features_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_for_testing.csv").unwrap();
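Note: the re-ranking step above hinges on a descending sort of f32 scores; because f32 is only PartialOrd, the patch uses `partial_cmp` with an `Ordering::Equal` fallback so NaN scores cannot panic the comparator. A minimal standalone illustration of that idiom:

    fn main() {
        // (row index, classifier score) pairs for one spectrum group.
        let mut group: Vec<(usize, f32)> = vec![(0, 0.2), (1, 0.9), (2, f32::NAN), (3, 0.5)];
        // Best score first; NaN compares as Equal, and sort_by is stable,
        // so a NaN row keeps its relative position instead of panicking.
        group.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        for (rank, (row_idx, score)) in group.iter().enumerate() {
            println!("row {row_idx}: rank {} (score {score})", rank + 1);
        }
    }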
From 2c7e25e6126b2d8f78fdc426dafd98c6f3f32039 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 10:08:31 -0400
Subject: [PATCH 50/75] refactor: Update examples in classifiers crate

---
 .../examples/gbdt_semi_supervised_learning.rs | 104 +++++++-----
 .../examples/svm_semi_supervised_learning.rs  | 153 ++++++++++-------
 .../examples/xgb_semi_supervised_learning.rs  | 154 ++++++++++--------
 3 files changed, 247 insertions(+), 164 deletions(-)

diff --git a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
index d574709..d53f165 100644
--- a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
+++ b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
@@ -5,57 +5,84 @@ use ndarray::{Array1, Array2};
 use std::error::Error;
 use std::fs::File;
 use std::io::Write;
+use std::io::BufReader;

+use redeem_classifiers::data_handling::PsmMetadata;
 use redeem_classifiers::psm_scorer::SemiSupervisedLearner;
 use redeem_classifiers::models::utils::ModelType;
 use redeem_classifiers::report::{report::{Report, ReportSection}, plots::{plot_score_histogram, plot_pp}};

-fn read_features_tsv(path: &str) -> Result<Array2<f32>, Box<dyn Error>> {
+/// Load a test PSM CSV file into feature matrix, labels, and metadata.
+///
+/// # Arguments
+/// * `path` - Path to the CSV file
+///
+/// # Returns
+/// A tuple of (`x`, `y`, `PsmMetadata`)
+pub fn load_test_psm_csv(path: &str) -> Result<(Array2<f32>, Array1<i32>, PsmMetadata)> {
+    let file = File::open(path)?;
     let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b',')
-        .from_path(path)?;
-
-    let mut data = Vec::new();
+        .has_headers(true)
+        .from_reader(BufReader::new(file));
+
+    let headers = reader
+        .headers()?
+        .iter()
+        .map(|h| h.to_string())
+        .collect::<Vec<String>>();
+
+    // Find indices
+    let file_id_idx = headers.iter().position(|h| h == "file_id").unwrap();
+    let spec_id_idx = headers.iter().position(|h| h == "spec_id").unwrap();
+    let label_idx = headers.iter().position(|h| h == "label").unwrap();
+
+    // Everything else is a feature
+    let feature_indices: Vec<usize> = (0..headers.len())
+        .filter(|&i| i != file_id_idx && i != spec_id_idx && i != label_idx)
+        .collect();
+
+    let feature_names = feature_indices
+        .iter()
+        .map(|&i| headers[i].clone())
+        .collect::<Vec<String>>();
+
+    let mut file_ids = Vec::new();
+    let mut spec_ids = Vec::new();
+    let mut labels = Vec::new();
+    let mut features = Vec::new();

     for result in reader.records() {
         let record = result?;
-        let row: Vec<f32> = record
+
+        file_ids.push(record[file_id_idx].parse::<usize>()?);
+        spec_ids.push(record[spec_id_idx].to_string());
+        labels.push(record[label_idx].parse::<i32>()?);
+
+        let row = feature_indices
             .iter()
-            .map(|field| field.parse::<f32>())
-            .collect::<Result<Vec<f32>, _>>()?;
-        data.push(row);
+            .map(|&i| record[i].parse::<f32>().unwrap_or(f32::NAN))
+            .collect::<Vec<f32>>();
+
+        features.extend(row);
     }

-    let n_samples = data.len();
-    let n_features = data[0].len();
+    let n_rows = labels.len();
+    let n_cols = feature_indices.len();

-    Array2::from_shape_vec(
-        (n_samples, n_features),
-        data.into_iter().flatten().collect(),
-    )
-    .map_err(|e| e.into())
-}
+    let x = Array2::from_shape_vec((n_rows, n_cols), features)?;
+    let y = Array1::from_vec(labels);

-fn read_labels_tsv(path: &str) -> Result<Array1<i32>, Box<dyn Error>> {
-    let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b'\t')
-        .from_path(path)?;
-
-    let labels: Vec<i32> = reader
-        .records()
-        .map(|r| {
-            let record = r?;
-            let value = record.get(0).ok_or_else(|| "Empty row".to_string())?;
-            value.parse::<i32>().map_err(|e| e.into())
-        })
-        .collect::<Result<Vec<i32>, Box<dyn Error>>>()?;
-
-    Ok(Array1::from_vec(labels))
+    let metadata = PsmMetadata {
+        file_id: file_ids,
+        spec_id: spec_ids,
+        feature_names,
+    };
+
+    Ok((x, y, metadata))
 }

+
 fn save_predictions_to_csv(
     predictions: &Array1<f32>,
     file_path: &str,
@@ -71,12 +98,7 @@ fn save_predictions_to_csv(
 fn main() -> Result<()> {
     env_logger::init();

-    // Load the test data from the TSV files
-    let x = read_features_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_for_testing.csv").unwrap();
-    // Select first 10 columns of data
-    let x = x.slice(ndarray::s![.., ..10]).to_owned();
-
-    let y = read_labels_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_labels_for_testing.csv").unwrap();
+    let (x, y, metadata) = load_test_psm_csv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_with_metadata_for_testing_redeem.csv")?;

     println!("Loaded features shape: {:?}", x.shape());
     println!("Loaded labels shape: {:?}", y.shape());
@@ -97,7 +119,7 @@ fn main() -> Result<()> {
         3,
         Some((0.15, 1.0))
     );
-    let predictions = learner.fit(x, y.clone());
+    let predictions = learner.fit(x, y.clone(), metadata);

     println!("Labels: {:?}", y);

diff --git a/crates/redeem-classifiers/examples/svm_semi_supervised_learning.rs b/crates/redeem-classifiers/examples/svm_semi_supervised_learning.rs
index e9ad6b5..0b52a2c 100644
--- a/crates/redeem-classifiers/examples/svm_semi_supervised_learning.rs
+++ b/crates/redeem-classifiers/examples/svm_semi_supervised_learning.rs
@@ -1,84 +1,98 @@
-use anyhow::{Context, Result};
+use anyhow::{Context, Ok, Result};
 use csv::ReaderBuilder;
 use ndarray::{Array1, Array2};
 use std::error::Error;
 use std::fs::File;
-use std::io::Write;
+use std::io::{BufReader, Write};

+use redeem_classifiers::data_handling::PsmMetadata;
 use redeem_classifiers::psm_scorer::SemiSupervisedLearner;
 use redeem_classifiers::models::utils::ModelType;

-fn read_features_tsv(path: &str) -> Result<Array2<f32>, Box<dyn Error>> {
+/// Load a test PSM CSV file into feature matrix, labels, and metadata.
+///
+/// # Arguments
+/// * `path` - Path to the CSV file
+///
+/// # Returns
+/// A tuple of (`x`, `y`, `PsmMetadata`)
+pub fn load_test_psm_csv(path: &str) -> Result<(Array2<f32>, Array1<i32>, PsmMetadata)> {
+    let file = File::open(path)?;
     let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b',')
-        .from_path(path)?;
-
-    let mut data = Vec::new();
+        .has_headers(true)
+        .from_reader(BufReader::new(file));
+
+    let headers = reader
+        .headers()?
+        .iter()
+        .map(|h| h.to_string())
+        .collect::<Vec<String>>();
+
+    // Find indices
+    let file_id_idx = headers.iter().position(|h| h == "file_id").unwrap();
+    let spec_id_idx = headers.iter().position(|h| h == "spec_id").unwrap();
+    let label_idx = headers.iter().position(|h| h == "label").unwrap();
+
+    // Everything else is a feature
+    let feature_indices: Vec<usize> = (0..headers.len())
+        .filter(|&i| i != file_id_idx && i != spec_id_idx && i != label_idx)
+        .collect();
+
+    let feature_names = feature_indices
+        .iter()
+        .map(|&i| headers[i].clone())
+        .collect::<Vec<String>>();
+
+    let mut file_ids = Vec::new();
+    let mut spec_ids = Vec::new();
+    let mut labels = Vec::new();
+    let mut features = Vec::new();

     for result in reader.records() {
         let record = result?;
-        let row: Vec<f32> = record
-            .iter()
-            .map(|field| field.parse::<f32>())
-            .collect::<Result<Vec<f32>, _>>()?;
-        data.push(row);
-    }

-    let n_samples = data.len();
-    let n_features = data[0].len();
+        file_ids.push(record[file_id_idx].parse::<usize>()?);
+        spec_ids.push(record[spec_id_idx].to_string());
+        labels.push(record[label_idx].parse::<i32>()?);

-    Array2::from_shape_vec(
-        (n_samples, n_features),
-        data.into_iter().flatten().collect(),
-    )
-    .map_err(|e| e.into())
-}
+        let row = feature_indices
+            .iter()
+            .map(|&i| record[i].parse::<f32>().unwrap_or(f32::NAN))
+            .collect::<Vec<f32>>();

-fn read_labels_tsv(path: &str) -> Result<Array1<i32>, Box<dyn Error>> {
-    let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b'\t')
-        .from_path(path)?;
-
-    let labels: Vec<i32> = reader
-        .records()
-        .map(|r| {
-            let record = r?;
-            let value = record.get(0).ok_or_else(|| "Empty row".to_string())?;
-            value.parse::<i32>().map_err(|e| e.into())
-        })
-        .collect::<Result<Vec<i32>, Box<dyn Error>>>()?;
-
-    Ok(Array1::from_vec(labels))
-}
+        features.extend(row);
+    }

-fn save_predictions_to_csv(
-    predictions: &Array1<f32>,
-    file_path: &str,
-) -> Result<(), Box<dyn Error>> {
-    let mut file = File::create(file_path)?;
+    let n_rows = labels.len();
+    let n_cols = feature_indices.len();

-    for &pred in predictions.iter() {
-        writeln!(file, "{}", pred)?;
-    }
+    let x = Array2::from_shape_vec((n_rows, n_cols), features)?;
+    let y = Array1::from_vec(labels);

-    Ok(())
+    let metadata = PsmMetadata {
+        file_id: file_ids,
+        spec_id: spec_ids,
+        feature_names,
+    };
+
+    Ok((x, y, metadata))
 }

-fn main() -> Result<()> {
-    env_logger::init();
-    // Load the test data from the TSV files
-    let x = read_features_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_for_testing.csv").unwrap();
-    let y = read_labels_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_labels_for_testing.csv").unwrap();
+// fn save_predictions_to_csv(
+//     predictions: &Array1<f32>,
+//     file_path: &str,
+// ) -> Result<(), Box<dyn Error>> {
+//     let mut file = File::create(file_path)?;

-    // Select first 10 columns of data
-    let x = x.slice(ndarray::s![.., ..10]).to_owned();
+//     for &pred in predictions.iter() {
+//         writeln!(file, "{}", pred)?;
+//     }

-    println!("Loaded features shape: {:?}", x.shape());
-    println!("Loaded labels shape: {:?}", y.shape());
+//     Ok(())
+// }

-    // Create and train your SemiSupervisedLearner
+#[cfg(feature = "linfa")]
+fn run_psm_scorer(x: &Array2<f32>, y: &Array1<i32>, metadata: &PsmMetadata) -> Result<Array1<f32>> {
     let params = ModelType::SVM {
         eps: 0.1,
         c: (1.0, 1.0),
@@ -94,7 +108,28 @@ fn main() -> Result<()> {
         500,
         Some((0.15, 1.0))
     );
-    let predictions = learner.fit(x, y.clone());
+    let predictions = learner.fit(x, y.clone(), metadata);
+    Ok(predictions)
+}
+
+#[cfg(not(feature = "linfa"))]
+fn run_psm_scorer(x: &Array2<f32>, y: &Array1<i32>, metadata: &PsmMetadata) -> Result<Array1<f32>> {
+    unimplemented!("SVM is not available in this build. Please enable the linfa feature.");
+}
+
+fn main() -> Result<()> {
+    env_logger::init();
+    // Load the test data from the TSV files
+    let (x, y, metadata) = load_test_psm_csv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_with_metadata_for_testing_redeem.csv")?;
+
+    // Select first 10 columns of data
+    let x = x.slice(ndarray::s![.., ..10]).to_owned();
+
+    println!("Loaded features shape: {:?}", x.shape());
+    println!("Loaded labels shape: {:?}", y.shape());
+
+    // Create and train your SemiSupervisedLearner
+    let predictions = run_psm_scorer(&x, &y, &metadata).context("Failed to run PSM scorer")?;

     println!("Labels: {:?}", y);

diff --git a/crates/redeem-classifiers/examples/xgb_semi_supervised_learning.rs b/crates/redeem-classifiers/examples/xgb_semi_supervised_learning.rs
index 6c4697f..66f80cb 100644
--- a/crates/redeem-classifiers/examples/xgb_semi_supervised_learning.rs
+++ b/crates/redeem-classifiers/examples/xgb_semi_supervised_learning.rs
@@ -1,58 +1,83 @@
 use anyhow::{Context, Result};
 use csv::ReaderBuilder;
-use machine_info::Machine;
 use ndarray::{Array1, Array2};
+
 use std::error::Error;
 use std::fs::File;
-use std::io::Write;
+use std::io::{BufReader, Write};
 use std::process;

+use redeem_classifiers::data_handling::PsmMetadata;
 use redeem_classifiers::psm_scorer::SemiSupervisedLearner;
 use redeem_classifiers::models::utils::ModelType;

-fn read_features_tsv(path: &str) -> Result<Array2<f32>, Box<dyn Error>> {
+/// Load a test PSM CSV file into feature matrix, labels, and metadata.
+///
+/// # Arguments
+/// * `path` - Path to the CSV file
+///
+/// # Returns
+/// A tuple of (`x`, `y`, `PsmMetadata`)
+pub fn load_test_psm_csv(path: &str) -> Result<(Array2<f32>, Array1<i32>, PsmMetadata)> {
+    let file = File::open(path)?;
     let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b',')
-        .from_path(path)?;
-
-    let mut data = Vec::new();
+        .has_headers(true)
+        .from_reader(BufReader::new(file));
+
+    let headers = reader
+        .headers()?
+        .iter()
+        .map(|h| h.to_string())
+        .collect::<Vec<String>>();
+
+    // Find indices
+    let file_id_idx = headers.iter().position(|h| h == "file_id").unwrap();
+    let spec_id_idx = headers.iter().position(|h| h == "spec_id").unwrap();
+    let label_idx = headers.iter().position(|h| h == "label").unwrap();
+
+    // Everything else is a feature
+    let feature_indices: Vec<usize> = (0..headers.len())
+        .filter(|&i| i != file_id_idx && i != spec_id_idx && i != label_idx)
+        .collect();
+
+    let feature_names = feature_indices
+        .iter()
+        .map(|&i| headers[i].clone())
+        .collect::<Vec<String>>();
+
+    let mut file_ids = Vec::new();
+    let mut spec_ids = Vec::new();
+    let mut labels = Vec::new();
+    let mut features = Vec::new();

     for result in reader.records() {
         let record = result?;
-        let row: Vec<f32> = record
+
+        file_ids.push(record[file_id_idx].parse::<usize>()?);
+        spec_ids.push(record[spec_id_idx].to_string());
+        labels.push(record[label_idx].parse::<i32>()?);
+
+        let row = feature_indices
             .iter()
-            .map(|field| field.parse::<f32>())
-            .collect::<Result<Vec<f32>, _>>()?;
-        data.push(row);
+            .map(|&i| record[i].parse::<f32>().unwrap_or(f32::NAN))
+            .collect::<Vec<f32>>();
+
+        features.extend(row);
     }

-    let n_samples = data.len();
-    let n_features = data[0].len();
+    let n_rows = labels.len();
+    let n_cols = feature_indices.len();

-    Array2::from_shape_vec(
-        (n_samples, n_features),
-        data.into_iter().flatten().collect(),
-    )
-    .map_err(|e| e.into())
-}
+    let x = Array2::from_shape_vec((n_rows, n_cols), features)?;
+    let y = Array1::from_vec(labels);

-fn read_labels_tsv(path: &str) -> Result<Array1<i32>, Box<dyn Error>> {
-    let mut reader = ReaderBuilder::new()
-        .has_headers(false)
-        .delimiter(b'\t')
-        .from_path(path)?;
-
-    let labels: Vec<i32> = reader
-        .records()
-        .map(|r| {
-            let record = r?;
-            let value = record.get(0).ok_or_else(|| "Empty row".to_string())?;
-            value.parse::<i32>().map_err(|e| e.into())
-        })
-        .collect::<Result<Vec<i32>, Box<dyn Error>>>()?;
-
-    Ok(Array1::from_vec(labels))
+    let metadata = PsmMetadata {
+        file_id: file_ids,
+        spec_id: spec_ids,
+        feature_names,
+    };
+
+    Ok((x, y, metadata))
 }

 fn save_predictions_to_csv(
@@ -68,48 +93,49 @@ fn save_predictions_to_csv(
     Ok(())
 }

+#[cfg(feature = "xgboost")]
+fn run_psm_scorer(x: &Array2<f32>, y: &Array1<i32>, metadata: &PsmMetadata) -> Result<Array1<f32>> {
+    // Create and train your SemiSupervisedLearner
+
+    use std::fs::metadata;
+    let xgb_params = ModelType::XGBoost {
+        max_depth: 6,
+        num_boost_round: 100,
+        early_stopping_rounds: 10,
+        verbose_eval: false,
+    };
+let mut learner = SemiSupervisedLearner::new(
+    xgb_params,
+    0.01,
+    1.0,
+    5,
+    Some((1.0, 1.0))
+);
+let predictions = learner.fit(x, y.clone(), metadata);
+    Ok(predictions)
+}
+
+#[cfg(not(feature = "xgboost"))]
+fn run_psm_scorer(x: &Array2<f32>, y: &Array1<i32>, metadata: &PsmMetadata) -> Result<Array1<f32>> {
+    unimplemented!("xgboost is not available in this build. Please enable the xgboost feature.");
+}
+
 fn main() -> Result<()> {
     env_logger::init();
-
-    let mut m = Machine::new();
-    m.track_process(process::id() as i32).unwrap();

     // Load the test data from the TSV files
-    let x = read_features_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_for_testing.csv").unwrap();
-    // Select first 10 columns of data
-    let x = x.slice(ndarray::s![.., ..10]).to_owned();
-
-    let y = read_labels_tsv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_labels_for_testing.csv").unwrap();
+    let (x, y, metadata) = load_test_psm_csv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_with_metadata_for_testing_redeem.csv")?;

     println!("Loaded features shape: {:?}", x.shape());
     println!("Loaded labels shape: {:?}", y.shape());

-    // Create and train your SemiSupervisedLearner
-    let xgb_params = ModelType::XGBoost {
-        max_depth: 6,
-        num_boost_round: 100,
-        early_stopping_rounds: 10,
-        verbose_eval: false,
-    };
-    let mut learner = SemiSupervisedLearner::new(
-        xgb_params,
-        0.01,
-        1.0,
-        5,
-        Some((1.0, 1.0))
-    );
-    let predictions = learner.fit(x, y.clone());
+    let predictions = run_psm_scorer(&x, &y, &metadata).context("Failed to run PSM scorer")?;

     println!("Labels: {:?}", y);

     // Evaluate the predictions
     println!("Predictions: {:?}", predictions);

-    let processes = m.processes_status();
-    let system = m.system_status();
-    let graphics = m.graphics_status();
-    println!("{:?} {:?} {:?}", processes, system, graphics);
-
     // save_predictions_to_csv(&predictions, "/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/predictions.csv").unwrap();

     Ok(())
 }
\ No newline at end of file
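Note: the three examples above all assume a CSV whose header carries `file_id`, `spec_id`, and `label`, with every remaining column treated as a feature. A minimal input that would satisfy `load_test_psm_csv` (contents illustrative only; the real test file is not shown in this series):

    file_id,spec_id,label,rank,score
    0,scan_001,1,1,0.93
    0,scan_001,-1,2,0.41
    1,scan_017,1,1,0.88

Any feature cell that fails to parse as f32 is stored as NaN rather than aborting the load, so downstream models must tolerate missing values.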
From 122d5c1b8225bbb3880d8b27ba28d0fc5149fa85 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 10:08:41 -0400
Subject: [PATCH 51/75] refactor: Update rank feature and log rank changes in
 Experiment class

---
 crates/redeem-classifiers/src/data_handling.rs | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/crates/redeem-classifiers/src/data_handling.rs b/crates/redeem-classifiers/src/data_handling.rs
index aea785e..b28e9a5 100644
--- a/crates/redeem-classifiers/src/data_handling.rs
+++ b/crates/redeem-classifiers/src/data_handling.rs
@@ -91,6 +91,8 @@ impl Experiment {
     /// This re-ranks all PSMs per spectrum (grouped by file_id and spec_id),
     /// and sets the rank column in `self.x` accordingly (1 = best).
     ///
+    /// Also logs the percentage of PSMs whose rank changed.
+    ///
     /// # Arguments
     /// * `scores` - The current classifier scores (same length as rows in `x`)
     /// * `metadata` - PSM metadata with file_id and spec_id for grouping
@@ -114,17 +116,34 @@ impl Experiment {
                 .push((i, scores[i]));
         }

+        let mut changed_ranks = 0;
+
         // 3. For each group, sort by score descending and assign new rank
         for group in spectrum_groups.values_mut() {
             group.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
             for (rank, (row_idx, _)) in group.iter().enumerate() {
-                self.x[[*row_idx, rank_feature_idx]] = (rank + 1) as f32;
+                let old_rank = self.x[[*row_idx, rank_feature_idx]] as usize;
+                let new_rank = rank + 1;
+                if old_rank != new_rank {
+                    changed_ranks += 1;
+                }
+                self.x[[*row_idx, rank_feature_idx]] = new_rank as f32;
             }
         }

-        log::debug!("Updated rank feature for {} spectrum groups.", spectrum_groups.len());
+        let total = self.x.nrows();
+        let pct_changed = (changed_ranks as f64 / total as f64) * 100.0;
+
+        log::debug!(
+            "Updated rank feature for {} spectrum groups. Rank changed for {:.2}% of PSMs ({} of {}).",
+            spectrum_groups.len(),
+            pct_changed,
+            changed_ranks,
+            total
+        );
     }

+
     pub fn get_top_test_peaks(&self) -> Experiment {
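Note: on the `log` interplay the next patch relies on: `log::set_max_level` only raises the global cap that the `log` macros check before dispatching a record; `env_logger` still applies its own RUST_LOG-derived filter per record. A sketch of the interaction:

    fn main() {
        env_logger::init(); // filter comes from RUST_LOG (default: error)
        log::set_max_level(log::LevelFilter::Debug);
        // This record now passes the global cap, but env_logger will still
        // drop it unless RUST_LOG permits debug for this module.
        log::debug!("visible only if RUST_LOG allows debug");
    }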
From 8f7eea0fa611e6070680d0e7a890c23d2949b1c1 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 10:39:22 -0400
Subject: [PATCH 52/75] refactor: Set log level to debug in main function

---
 .../examples/gbdt_semi_supervised_learning.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
index d53f165..e0debf9 100644
--- a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
+++ b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs
@@ -98,6 +98,9 @@ fn save_predictions_to_csv(
 fn main() -> Result<()> {
     env_logger::init();

+    // Set log level to debug
+    log::set_max_level(log::LevelFilter::Debug);
+
     let (x, y, metadata) = load_test_psm_csv("/home/singjc/Documents/github/sage_bruker/20241115_single_file_redeem/sage_scores_with_metadata_for_testing_redeem.csv")?;

     println!("Loaded features shape: {:?}", x.shape());
     println!("Loaded labels shape: {:?}", y.shape());
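Note: the next patch swaps a download-at-runtime TSV for a compile-time embedded asset parsed lazily. The general pattern it uses pairs `include_bytes!` with `once_cell::sync::Lazy`; an illustrative standalone sketch (the `assets/example.tsv` file name is hypothetical):

    use once_cell::sync::Lazy;
    use std::collections::HashMap;

    // Bytes are baked into the binary at compile time, relative to the crate root.
    static TSV: &[u8] = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"), "/assets/example.tsv"));

    // Parsed once, on first access, from the embedded bytes; no file I/O at runtime.
    static TABLE: Lazy<HashMap<String, String>> = Lazy::new(|| {
        std::str::from_utf8(TSV)
            .expect("asset must be UTF-8")
            .lines()
            .filter_map(|l| l.split_once('\t'))
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect()
    });

    fn main() {
        println!("{} rows embedded", TABLE.len());
    }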
From fb087943f3e6fc1cec9bb6daf6a56891cb7f542b Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 12:41:31 -0400
Subject: [PATCH 53/75] refactor: Update loading of modifications to use byte
 slice instead of file path

---
 .../src/utils/peptdeep_utils.rs | 230 ++++++++----------
 1 file changed, 107 insertions(+), 123 deletions(-)

diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs
index 6a4757e..d802348 100644
--- a/crates/redeem-properties/src/utils/peptdeep_utils.rs
+++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs
@@ -12,9 +12,10 @@ use regex::Regex;
 use std::collections::HashMap;
 use serde::Deserialize;
 use zip::ZipArchive;
+use once_cell::sync::Lazy;
+
+const MODIFICATIONS_TSV_BYTES: &[u8] = include_bytes!(concat!(env!("CARGO_MANIFEST_DIR"),"/assets/modification.tsv"));

-const MOD_TSV_URL: &str = "https://raw.githubusercontent.com/MannLabs/alphabase/main/alphabase/constants/const_files/modification.tsv";
-const MOD_TSV_PATH: &str = "data/modification.tsv";
 const PRETRAINED_MODELS_URL: &str = "https://github.com/singjc/redeem/releases/download/v0.1.0-alpha/peptdeep_generic_pretrained_models.zip";
 const PRETRAINED_MODELS_ZIP: &str = "data/peptdeep_generic_pretrained_models.zip";
@@ -36,58 +37,59 @@ const MAX_INSTRUMENT_NUM: usize = 8;

 const UNKNOWN_INSTRUMENT_NUM: usize = MAX_INSTRUMENT_NUM - 1;

-pub fn download_pretrained_models_exist() -> Result<PathBuf, io::Error> {
-    let zip_path = PathBuf::from(PRETRAINED_MODELS_ZIP);
-    let extract_dir = PathBuf::from(PRETRAINED_MODELS_PATH);
+#[derive(Debug, Clone)]
+pub struct ModificationMap {
+    pub name: String,
+    pub amino_acid: Option<char>, // Optional if not applicable
+    pub unimod_id: Option<usize>
+}

-    // Ensure the parent directory exists
-    if let Some(parent) = zip_path.parent() {
-        fs::create_dir_all(parent)?;
-    }
+/// Loads a unified modification map where the key is either:
+/// - ("57.0215", Some('C')) for mass-based lookup
+/// - ("UniMod:4", Some('C')) for UniMod ID–based lookup
+/// Loads the modification map, parsing the embedded modifications.tsv.
+pub fn load_modifications() -> Result<HashMap<(String, Option<char>), ModificationMap>> {
+    let mut rdr = csv::ReaderBuilder::new()
+        .delimiter(b'\t')
+        .from_reader(MODIFICATIONS_TSV_BYTES);

-    // Download the zip file if it doesn't exist
-    if !zip_path.exists() {
-        info!("Downloading pretrained models...");
-        let mut response = reqwest::blocking::get(PRETRAINED_MODELS_URL)
-            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-        let mut file = File::create(&zip_path)?;
-        io::copy(&mut response, &mut file)?;
-    }
+    let mut modifications = HashMap::new();

-    // Unzip the file if the target directory doesn't exist
-    if !extract_dir.exists() {
-        info!("Unzipping pretrained models...");
-        let file = File::open(&zip_path)?;
-        let mut archive = ZipArchive::new(file)?;
+    for result in rdr.records() {
+        let record = result?;
+        let mod_name = record.get(0).unwrap_or("").to_string();
+        let unimod_mass: f64 = record.get(1).unwrap_or("0").parse().unwrap_or(0.0);
+        let unimod_id: Option<usize> = record.get(7).and_then(|s| s.parse().ok());

-        for i in 0..archive.len() {
-            let mut file = archive.by_index(i)?;
-            let outpath = extract_dir.join(file.mangled_name());
+        let mass_key = format!("{:.4}", unimod_mass);
+        let unimod_key = unimod_id.map(|id| format!("UniMod:{}", id));

-            if file.name().ends_with('/') {
-                // Create directory
-                fs::create_dir_all(&outpath)?;
-            } else {
-                // Write file
-                if let Some(parent) = outpath.parent() {
-                    fs::create_dir_all(parent)?;
-                }
-                let mut outfile = File::create(&outpath)?;
-                io::copy(&mut file, &mut outfile)?;
-            }
+        let amino_acid = mod_name.split('@').nth(1).and_then(|aa| aa.chars().next());
+
+        let modification = ModificationMap {
+            name: mod_name,
+            amino_acid,
+            unimod_id,
+        };
+
+        // Insert mass-based key
+        modifications.insert((mass_key.clone(), amino_acid), modification.clone());
+
+        // Insert unimod-id based key if available
+        if let Some(key) = unimod_key {
+            modifications.insert((key, amino_acid), modification.clone());
         }
     }

-    Ok(extract_dir)
+    Ok(modifications)
 }

-pub fn parse_instrument_index(instrument: &str) -> usize {
-    let upper_instrument = instrument.to_uppercase();
-
-    INSTRUMENT_DICT.iter()
-        .find(|&&(name, _)| name == upper_instrument)
-        .map_or(UNKNOWN_INSTRUMENT_NUM, |&(_, index)| index)
-}
+// Lazy static variable to hold the loaded modification map
+pub static MODIFICATION_MAP: Lazy<HashMap<(String, Option<char>), ModificationMap>> = Lazy::new(|| {
+    load_modifications().expect("Failed to load modifications")
+});
+
+

 #[derive(Clone, Debug, Deserialize)]
@@ -153,24 +155,6 @@ pub fn parse_model_constants(path: &str) -> Result<ModelConstants> {
     Ok(constants)
 }

-fn ensure_mod_tsv_exists() -> Result<PathBuf, io::Error> {
-    let path = PathBuf::from(MOD_TSV_PATH);
-
-    // Ensure the parent directory exists
-    if let Some(parent) = path.parent() {
-        fs::create_dir_all(parent)?;
-    }
-
-    if !path.exists() {
-        info!("Downloading modification.tsv...");
-        let mut response = reqwest::blocking::get(MOD_TSV_URL)
-            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-        let mut file = File::create(&path)?;
-        response.copy_to(&mut file)
-            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
-    }
-    Ok(path)
-}

 fn parse_mod_formula(formula: &str, mod_elem_to_idx: &HashMap<String, usize>, mod_feature_size: usize) -> Vec<f32> {
     let mut feature = vec![0.0; mod_feature_size];
@@ -191,10 +175,10 @@ fn parse_mod_formula(formula: &str, mod_elem_to_idx: &HashMap<String, usize>, mod_feature_size: usize) -> Vec<f32> {
 }

 pub fn load_mod_to_feature(constants: &ModelConstants) -> Result<HashMap<String, Vec<f32>>, Error> {
-    let path = ensure_mod_tsv_exists()?;
+
     let mut rdr = ReaderBuilder::new()
         .delimiter(b'\t')
-        .from_path(path)?;
+        .from_reader(MODIFICATIONS_TSV_BYTES); // Read from the byte slice

     // Create mod_elem_to_idx mapping
     let mod_elem_to_idx: HashMap<String, usize> = constants.mod_elements.iter()
@@ -218,10 +202,10 @@ pub fn load_mod_to_feature_arc(
     constants: &ModelConstants,
 ) -> Result<HashMap<Arc<[u8]>, Vec<f32>>, Error> {
-    let path = ensure_mod_tsv_exists()?;
+
     let mut rdr = ReaderBuilder::new()
         .delimiter(b'\t')
-        .from_path(path)?;
+        .from_reader(MODIFICATIONS_TSV_BYTES);

     let mod_elem_to_idx: HashMap<String, usize> = constants
         .mod_elements
@@ -243,59 +227,6 @@ pub fn load_mod_to_feature_arc(
 }

-#[derive(Debug, Clone)]
-pub struct ModificationMap {
-    pub name: String,
-    pub amino_acid: Option<char>, // Optional if not applicable
-    pub unimod_id: Option<usize>
-}
-
-
-/// Loads a unified modification map where the key is either:
-/// - ("57.0215", Some('C')) for mass-based lookup
-/// - ("UniMod:4", Some('C')) for UniMod ID–based lookup
-pub fn load_modifications() -> Result<HashMap<(String, Option<char>), ModificationMap>> {
-    let path: PathBuf = ensure_mod_tsv_exists().context("Failed to ensure TSV exists")?;
-
-    let mut rdr = ReaderBuilder::new()
-        .delimiter(b'\t')
-        .from_path(&path)
-        .context("Failed to read modification TSV file")?;
-
-    let mut modifications = HashMap::new();
-
-    for result in rdr.records() {
-        let record = result.context("Failed to read record")?;
-        let mod_name = record.get(0).unwrap_or("").to_string();
-        let unimod_mass: f64 = record.get(1).unwrap_or("0").parse().unwrap_or(0.0);
-        let unimod_id: Option<usize> = record.get(7).and_then(|s| s.parse().ok());
-
-        let mass_key = format!("{:.4}", unimod_mass);
-        let unimod_key = unimod_id.map(|id| format!("UniMod:{}", id));
-
-        let amino_acid = mod_name.split('@').nth(1).and_then(|aa| aa.chars().next());
-
-        let modification = ModificationMap {
-            name: mod_name,
-            amino_acid,
-            unimod_id,
-        };
-
-        // Insert mass-based key
-        modifications.insert((mass_key.clone(), amino_acid), modification.clone());
-
-        // Insert unimod-id based key if available
-        if let Some(key) = unimod_key {
-            modifications.insert((key, amino_acid), modification.clone());
-        }
-    }
-
-    Ok(modifications)
-}
-
-
-
 /// Removes mass shifts and UniMod annotations from a modified peptide sequence.
 ///
 /// Supports both bracketed mass shifts (e.g., `[+57.0215]`) and UniMod-style
@@ -595,6 +526,59 @@ pub fn get_modification_string(
 }

+pub fn download_pretrained_models_exist() -> Result<PathBuf, io::Error> {
+    let zip_path = PathBuf::from(PRETRAINED_MODELS_ZIP);
+    let extract_dir = PathBuf::from(PRETRAINED_MODELS_PATH);
+
+    // Ensure the parent directory exists
+    if let Some(parent) = zip_path.parent() {
+        fs::create_dir_all(parent)?;
+    }
+
+    // Download the zip file if it doesn't exist
+    if !zip_path.exists() {
+        info!("Downloading pretrained models...");
+        let mut response = reqwest::blocking::get(PRETRAINED_MODELS_URL)
+            .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+        let mut file = File::create(&zip_path)?;
+        io::copy(&mut response, &mut file)?;
+    }
+
+    // Unzip the file if the target directory doesn't exist
+    if !extract_dir.exists() {
+        info!("Unzipping pretrained models...");
+        let file = File::open(&zip_path)?;
+        let mut archive = ZipArchive::new(file)?;
+
+        for i in 0..archive.len() {
+            let mut file = archive.by_index(i)?;
+            let outpath = extract_dir.join(file.mangled_name());
+
+            if file.name().ends_with('/') {
+                // Create directory
+                fs::create_dir_all(&outpath)?;
+            } else {
+                // Write file
+                if let Some(parent) = outpath.parent() {
+                    fs::create_dir_all(parent)?;
+                }
+                let mut outfile = File::create(&outpath)?;
+                io::copy(&mut file, &mut outfile)?;
+            }
+        }
+    }
+
+    Ok(extract_dir)
+}
+
+pub fn parse_instrument_index(instrument: &str) -> usize {
+    let upper_instrument = instrument.to_uppercase();
+
+    INSTRUMENT_DICT.iter()
+        .find(|&&(name, _)| name == upper_instrument)
+        .map_or(UNKNOWN_INSTRUMENT_NUM, |&(_, index)| index)
+}

 // TODO: Derive from PeptDep constants yaml
@@ -758,7 +742,7 @@ mod tests {

     #[test]
     fn test_get_modification_string() {
-        let modification_map = load_modifications().unwrap();
+        let modification_map = MODIFICATION_MAP.clone();

         let test_cases = vec![
             ("PEPTIDE", ""),
             ("PEPTC[+57.0215]IDE", "Carbamidomethyl@C"),
             ("P[+15.9949]EPT[+79.9663]IDE", "Oxidation@P;Phospho@T"),
             ("TVQSLEIDLDSM[+15.9949]R", "Oxidation@M"),
             ("TVQS[+79.9663]LEIDLDSM[+15.9949]R", "Phospho@S;Oxidation@M"),
-            ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Any_N-term;Oxidation@M"),
-            ("[+42.0106]PEPTIDE", "Any_N-term"),
+            ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Acetyl@Protein_N-term;Oxidation@M"),
+            ("[+42.0106]PEPTIDE", "Acetyl@Protein_N-term"),
             ("PEPTIDE[+42.0106]", ""),
             ("P[+15.9949]EP[+79.9663]T[+15.9949]IDE", "Oxidation@P;Oxidation@T"),
-            ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Any_N-term;Oxidation@M"),
+            ("(UniMod:1)M(UniMod:35)AAAATMAAAAR", "Acetyl@Protein_N-term;Oxidation@M"),
         ];
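Note: the map introduced above is deliberately keyed two ways, so callers can resolve a modification either from a rounded delta mass or from a UniMod accession. Illustrative lookups, assuming `MODIFICATION_MAP` is re-exported at this path as in the diff; key formats follow the patch's `{:.4}` mass and `UniMod:{id}` conventions:

    use redeem_properties::utils::peptdeep_utils::MODIFICATION_MAP;

    fn main() {
        // Mass-based key: the delta mass formatted to four decimals, plus the residue.
        if let Some(m) = MODIFICATION_MAP.get(&("57.0215".to_string(), Some('C'))) {
            println!("by mass: {}", m.name); // Carbamidomethyl@C
        }
        // UniMod-based key for the same modification (UniMod:4 per the TSV).
        if let Some(m) = MODIFICATION_MAP.get(&("UniMod:4".to_string(), Some('C'))) {
            println!("by accession: {}", m.name);
        }
    }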
redeem_properties::models::ccs_model::load_collision_cross_section_model; use redeem_properties::models::model_interface::ModelInterface; use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel; use redeem_properties::models::rt_model::load_retention_time_model; -use redeem_properties::utils::data_handling::{PeptideData, RTNormalization}; +use redeem_properties::utils::data_handling::{PeptideData, TargetNormalization}; use redeem_properties::utils::peptdeep_utils::load_modifications; use redeem_properties::utils::utils::get_device; diff --git a/crates/redeem-cli/src/properties/load_data.rs b/crates/redeem-cli/src/properties/load_data.rs index eac4717..9b5b746 100644 --- a/crates/redeem-cli/src/properties/load_data.rs +++ b/crates/redeem-cli/src/properties/load_data.rs @@ -5,7 +5,7 @@ use std::io::BufReader; use anyhow::{Result, Context}; use csv::ReaderBuilder; use redeem_properties::utils::peptdeep_utils::{get_modification_indices, get_modification_string, ModificationMap}; -use redeem_properties::utils::{data_handling::{PeptideData, RTNormalization}, peptdeep_utils::remove_mass_shift}; +use redeem_properties::utils::{data_handling::{PeptideData, TargetNormalization}, peptdeep_utils::remove_mass_shift}; @@ -17,9 +17,9 @@ pub fn load_peptide_data>( model_arch: &str, nce: Option, instrument: Option, - normalize_rt: Option, + normalize_target: Option, modifications: &HashMap<(String, Option), ModificationMap>, -) -> Result<(Vec, RTNormalization)> { +) -> Result<(Vec, TargetNormalization)> { let file = File::open(&path) .with_context(|| format!("Failed to open file: {:?}", path.as_ref()))?; let reader = BufReader::new(file); @@ -34,7 +34,13 @@ pub fn load_peptide_data>( let headers = rdr.headers()?.clone(); let mut peptides = Vec::new(); - let mut rt_values = Vec::new(); + let mut target_values = Vec::new(); + + let normalize_field = if model_arch.contains("ccs") { + "ccs" + } else { + "retention time" + }; for result in rdr.records() { let record = result?; @@ -51,7 +57,6 @@ pub fn load_peptide_data>( let sequence_str = String::from_utf8_lossy(&sequence_bytes); let naked_sequence = Arc::from(remove_mass_shift(&sequence_str).as_bytes().to_vec().into_boxed_slice()); - let mods: Arc<[u8]> = Arc::from(get_modification_string(&sequence_str, modifications).into_bytes().into_boxed_slice()); let mod_sites: Arc<[u8]> = Arc::from(get_modification_indices(&sequence_str).into_bytes().into_boxed_slice()); @@ -98,10 +103,12 @@ pub fn load_peptide_data>( }), _ => None, }; - - if let Some(rt) = retention_time { - rt_values.push(rt); + if let Some(val) = match normalize_field { + "ccs" => ccs, + _ => retention_time, + } { + target_values.push(val); } peptides.push(PeptideData { @@ -120,29 +127,38 @@ pub fn load_peptide_data>( }); } - match RTNormalization::from_str(normalize_rt) { - RTNormalization::ZScore(_, _) if !rt_values.is_empty() => { - let mean = rt_values.iter().copied().sum::() / rt_values.len() as f32; - let std = (rt_values.iter().map(|v| (v - mean).powi(2)).sum::() / rt_values.len() as f32).sqrt(); + match TargetNormalization::from_str(normalize_target) { + TargetNormalization::ZScore(_, _) if !target_values.is_empty() => { + let mean = target_values.iter().copied().sum::() / target_values.len() as f32; + let std = (target_values.iter().map(|v| (v - mean).powi(2)).sum::() / target_values.len() as f32).sqrt(); for peptide in &mut peptides { - if let Some(rt) = peptide.retention_time.as_mut() { - *rt = (*rt - mean) / std; + match normalize_field { + "ccs" => if let Some(val) = 
peptide.ccs.as_mut() { + *val = (*val - mean) / std; + }, + _ => if let Some(val) = peptide.retention_time.as_mut() { + *val = (*val - mean) / std; + }, } } - Ok((peptides, RTNormalization::ZScore(mean, std))) + Ok((peptides, TargetNormalization::ZScore(mean, std))) } - RTNormalization::MinMax(_, _) if !rt_values.is_empty() => { - let min = *rt_values.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); - let max = *rt_values.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); + TargetNormalization::MinMax(_, _) if !target_values.is_empty() => { + let min = *target_values.iter().min_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); + let max = *target_values.iter().max_by(|a, b| a.partial_cmp(b).unwrap()).unwrap(); let range = max - min; for peptide in &mut peptides { - if let Some(rt) = peptide.retention_time.as_mut() { - *rt = (*rt - min) / range; + match normalize_field { + "ccs" => if let Some(val) = peptide.ccs.as_mut() { + *val = (*val - min) / range; + }, + _ => if let Some(val) = peptide.retention_time.as_mut() { + *val = (*val - min) / range; + }, } } - Ok((peptides, RTNormalization::MinMax(min, max))) + Ok((peptides, TargetNormalization::MinMax(min, max))) } - _ => Ok((peptides, RTNormalization::None)) + _ => Ok((peptides, TargetNormalization::None)), } } - diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index bbbec1c..701bf8e 100644 --- a/crates/redeem-cli/src/properties/train/trainer.rs +++ b/crates/redeem-cli/src/properties/train/trainer.rs @@ -6,7 +6,7 @@ use redeem_properties::models::{ ccs_cnn_lstm_model::CCSCNNLSTMModel, ccs_cnn_tf_model::CCSCNNTFModel, rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel, }; -use redeem_properties::utils::data_handling::{PeptideData, RTNormalization}; +use redeem_properties::utils::data_handling::{PeptideData, TargetNormalization}; use redeem_properties::utils::peptdeep_utils::load_modifications; use redeem_properties::utils::utils::get_device; use report_builder::{ @@ -210,9 +210,9 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { match (true_pep.retention_time, pred_pep.retention_time) { (Some(t), Some(p)) => { let t_denorm = match norm_factor { - RTNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64, - RTNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64, - RTNormalization::None => t as f64, + TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64, + TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64, + TargetNormalization::None => t as f64, }; Some((t_denorm, p as f64)) } diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs index 2353c44..eec347a 100644 --- a/crates/redeem-properties/src/models/model_interface.rs +++ b/crates/redeem-properties/src/models/model_interface.rs @@ -5,7 +5,7 @@ use crate::{ }, models::{ccs_model::CCSModelWrapper, ms2_model::MS2ModelWrapper, rt_model::RTModelWrapper}, utils::{ - data_handling::{PeptideBatchData, PeptideData, RTNormalization}, + data_handling::{PeptideBatchData, PeptideData, TargetNormalization}, logging::Progress, peptdeep_utils::{ get_modification_indices, get_modification_string, parse_instrument_index, @@ -910,7 +910,7 @@ pub trait ModelInterface: Send + Sync + ModelClone { (String, Option), crate::utils::peptdeep_utils::ModificationMap, >, - rt_norm: RTNormalization, + rt_norm: TargetNormalization, ) -> Result> 
From 14ff5b6c286b0748a514492bee4aa34b077e1310 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 12:42:03 -0400
Subject: [PATCH 55/75] refactor: Add once_cell dependency for redeem-properties crate

---
 crates/redeem-properties/Cargo.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/redeem-properties/Cargo.toml b/crates/redeem-properties/Cargo.toml
index 56ef10b..e29ba46 100644
--- a/crates/redeem-properties/Cargo.toml
+++ b/crates/redeem-properties/Cargo.toml
@@ -14,6 +14,7 @@ env_logger = "0.8.4"
 log = "0.4.0"
 serde = { version = "1.0", features = ["derive"] }
 serde_yaml = "0.9"
+once_cell = "1.8"
 ndarray = "0.15"
 #ndarray = "0.16.1"
 reqwest = { version = "0.11", features = ["blocking"] }
From 782d9c38a93c4643371ad831eb4b397548b08162 Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 12:45:55 -0400
Subject: [PATCH 56/75] add: modification.tsv asset

---
 .../redeem-properties/assets/modification.tsv | 2797 +++++++++++++++++
 1 file changed, 2797
insertions(+) create mode 100644 crates/redeem-properties/assets/modification.tsv diff --git a/crates/redeem-properties/assets/modification.tsv b/crates/redeem-properties/assets/modification.tsv new file mode 100644 index 0000000..47c455f --- /dev/null +++ b/crates/redeem-properties/assets/modification.tsv @@ -0,0 +1,2797 @@ +mod_name unimod_mass unimod_avge_mass composition unimod_modloss modloss_composition classification unimod_id smiles modloss_importance +Acetyl@T 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0 +Acetyl@Protein_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 C(=O)C 0.0 +Acetyl@S 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0 +Acetyl@C 42.010565 42.0367 H(2)C(2)O(1) 0.0 Post-translational 1 0.0 +Acetyl@Any_N-term 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 C(=O)C 0.0 +Acetyl@K 42.010565 42.0367 H(2)C(2)O(1) 0.0 Multiple 1 CC(=O)NCCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] 0.0 +Acetyl@Y 42.010565 42.0367 H(2)C(2)O(1) 0.0 Chemical derivative 1 0.0 +Acetyl@H 42.010565 42.0367 H(2)C(2)O(1) 0.0 Chemical derivative 1 0.0 +Acetyl@R 42.010565 42.0367 H(2)C(2)O(1) 0.0 Artefact 1 0.0 +Amidated@Any_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Artefact 2 N 0.0 +Amidated@Protein_C-term -0.984016 -0.9848 H(1)N(1)O(-1) 0.0 Post-translational 2 N 0.0 +Biotin@Any_N-term 226.077598 226.2954 H(14)C(10)N(2)O(2)S(1) 0.0 Chemical derivative 3 C(=O)CCCCC1SCC2NC(=O)NC21 0.0 +Biotin@K 226.077598 226.2954 H(14)C(10)N(2)O(2)S(1) 0.0 Post-translational 3 0.0 +Carbamidomethyl@Y 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@T 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@S 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@E 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@D 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@H 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@Any_N-term 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 C(=O)NC 0.0 +Carbamidomethyl@K 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Artefact 4 0.0 +Carbamidomethyl@C 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 4 C(C(C(=O)[Rn])N([Xe])([Xe]))SCC(=O)N 0.0 +Carbamidomethyl@U 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 4 0.0 +Carbamidomethyl@M 57.021464 57.0513 H(3)C(2)N(1)O(1) 105.024835 H(7)C(3)N(1)O(1)S(1) Chemical derivative 4 CS(CCC(N([Xe])([Xe]))C([Rn])=O)=CC(N)=O 0.5 +Carbamyl@Y 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0 +Carbamyl@T 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0 +Carbamyl@S 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Chemical derivative 5 0.0 +Carbamyl@M 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0 +Carbamyl@C 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0 +Carbamyl@R 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Artefact 5 0.0 +Carbamyl@Any_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Multiple 5 C(=O)N 0.0 +Carbamyl@K 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Multiple 5 0.0 +Carbamyl@Protein_N-term 43.005814 43.0247 H(1)C(1)N(1)O(1) 0.0 Post-translational 5 C(=O)N 0.0 +Carboxymethyl@Any_N-term 58.005479 58.0361 H(2)C(2)O(2) 0.0 Artefact 6 0.0 +Carboxymethyl@K 58.005479 58.0361 H(2)C(2)O(2) 0.0 Artefact 6 0.0 +Carboxymethyl@C 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0 +Carboxymethyl@W 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0 +Carboxymethyl@U 58.005479 58.0361 H(2)C(2)O(2) 0.0 Chemical derivative 6 0.0 +Deamidated@Q 0.984016 0.9848 
H(-1)N(-1)O(1) 0.0 Artefact 7 C(CC(=O)O)[C@@H](C(=O)[Rn])N([Xe])([Xe]) 0.0 +Deamidated@R 0.984016 0.9848 H(-1)N(-1)O(1) 43.005814 H(1)C(1)N(1)O(1) Post-translational 7 0.5 +Deamidated@N 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Artefact 7 C([C@@H](C(=O)[Rn])N([Xe])([Xe]))C(=O)O 0.0 +Deamidated@F^Protein_N-term 0.984016 0.9848 H(-1)N(-1)O(1) 0.0 Post-translational 7 0.0 +ICAT-G@C 486.251206 486.6253 H(38)C(22)N(4)O(6)S(1) 0.0 Isotopic label 8 0.0 +ICAT-G:2H(8)@C 494.30142 494.6746 H(30)2H(8)C(22)N(4)O(6)S(1) 0.0 Isotopic label 9 0.0 +Met->Hse@M^Any_C-term -29.992806 -30.0922 H(-2)C(-1)O(1)S(-1) 0.0 Chemical derivative 10 N([Xe])([Xe])[C@H](C(=O)[Rn])CCO 0.0 +Met->Hsl@M^Any_C-term -48.003371 -48.1075 H(-4)C(-1)S(-1) 0.0 Chemical derivative 11 0.0 +ICAT-D:2H(8)@C 450.275205 450.6221 H(26)2H(8)C(20)N(4)O(5)S(1) 0.0 Isotopic label 12 0.0 +ICAT-D@C 442.224991 442.5728 H(34)C(20)N(4)O(5)S(1) 0.0 Isotopic label 13 0.0 +NIPCAM@C 99.068414 99.1311 H(9)C(5)N(1)O(1) 0.0 Chemical derivative 17 0.0 +PEO-Iodoacetyl-LC-Biotin@C 414.193691 414.5196 H(30)C(18)N(4)O(5)S(1) 0.0 Chemical derivative 20 0.0 +Phospho@E 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@R 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@K 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@H 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@C 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@D 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 0.0 +Phospho@Y 79.966331 79.9799 H(1)O(3)P(1) 0.0 Post-translational 21 C1=CC(=CC=C1CC(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O 0.0 +Phospho@T 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 CC(C(C(=O)[Rn])N([Xe])([Xe]))OP(=O)(O)O 10000000.0 +Phospho@S 79.966331 79.9799 H(1)O(3)P(1) 97.976896 H(3)O(4)P(1) Post-translational 21 O=P(O)(O)OC[C@@H](C(=O)[Rn])N([Xe])([Xe]) 100000000.0 +Methamidophos-S@Y 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Methamidophos-S@T 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Methamidophos-S@S 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Methamidophos-S@K 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Methamidophos-S@H 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Methamidophos-S@C 108.975121 109.0873 H(4)C(1)N(1)O(1)P(1)S(1) 0.0 Chemical derivative 2007 0.0 +Dehydrated@D -18.010565 -18.0153 H(-2)O(-1) 0.0 Chemical derivative 23 0.0 +Dehydrated@Y -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translational 23 0.0 +Dehydrated@T -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translational 23 0.0 +Dehydrated@S -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translational 23 0.0 +Dehydrated@N^Protein_C-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translational 23 0.0 +Dehydrated@Q^Protein_C-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Post-translational 23 0.0 +Dehydrated@C^Any_N-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Artefact 23 0.0 +Propionamide@C 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Artefact 24 0.0 +Propionamide@K 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Chemical derivative 24 0.0 +Propionamide@Any_N-term 71.037114 71.0779 H(5)C(3)N(1)O(1) 0.0 Chemical derivative 24 CCC(N)=O 0.0 +Pyridylacetyl@Any_N-term 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 25 C(=O)Cc1ccccn1 0.0 +Pyridylacetyl@K 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 25 0.0 +Pyro-carbamidomethyl@C^Any_N-term 
39.994915 40.0208 C(2)O(1) 0.0 Artefact 26 0.0 +Glu->pyro-Glu@E^Any_N-term -18.010565 -18.0153 H(-2)O(-1) 0.0 Artefact 27 O=C([Rn])[C@H]1N([Xe])C(=O)CC1 0.0 +Gln->pyro-Glu@Q^Any_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Artefact 28 O=C([Rn])[C@H]1N([Xe])C(=O)CC1 0.0 +SMA@Any_N-term 127.063329 127.1412 H(9)C(6)N(1)O(2) 0.0 Chemical derivative 29 0.0 +SMA@K 127.063329 127.1412 H(9)C(6)N(1)O(2) 0.0 Chemical derivative 29 0.0 +Cation:Na@D 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 0.0 +Cation:Na@Any_C-term 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 O[Na] 0.0 +Cation:Na@E 21.981943 21.9818 H(-1)Na(1) 0.0 Artefact 30 0.0 +Pyridylethyl@C 105.057849 105.1372 H(7)C(7)N(1) 0.0 Chemical derivative 31 C1=CN=CC=C1CCSCC(C(=O)[Rn])N([Xe])([Xe]) 0.0 +Methyl@E 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@D 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@Any_C-term 14.01565 14.0266 H(2)C(1) 0.0 Multiple 34 OC 0.0 +Methyl@Protein_N-term 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 C 0.0 +Methyl@L 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@I 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@R 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@Q 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@Any_N-term 14.01565 14.0266 H(2)C(1) 0.0 Chemical derivative 34 C 0.0 +Methyl@N 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@K 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@H 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@C 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@S 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Methyl@T 14.01565 14.0266 H(2)C(1) 0.0 Post-translational 34 0.0 +Oxidation@T 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@E 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@S 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@Q 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@L 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@I 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@U 15.994915 15.9994 O(1) 0.0 Multiple 35 0.0 +Oxidation@G^Any_C-term 15.994915 15.9994 O(1) 0.0 Pre-translational 35 0.0 +Oxidation@W 15.994915 15.9994 O(1) 0.0 Artefact 35 0.0 +Oxidation@C 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@H 15.994915 15.9994 O(1) 0.0 Artefact 35 0.0 +Oxidation@V 15.994915 15.9994 O(1) 0.0 Chemical derivative 35 0.0 +Oxidation@R 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@M 15.994915 15.9994 O(1) 63.998285 H(4)C(1)O(1)S(1) Artefact 35 O=C([Rn])C(N([Xe])([Xe]))CCS(=O)C 0.5 +Oxidation@Y 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@F 15.994915 15.9994 O(1) 0.0 Artefact 35 0.0 +Oxidation@P 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@N 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@K 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Oxidation@D 15.994915 15.9994 O(1) 0.0 Post-translational 35 0.0 +Dimethyl@Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 C 0.0 +Dimethyl@P^Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Post-translational 36 0.0 +Dimethyl@N 28.0313 28.0532 H(4)C(2) 0.0 Post-translational 36 0.0 +Dimethyl@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Isotopic label 36 C 0.0 +Dimethyl@K 28.0313 28.0532 H(4)C(2) 0.0 Multiple 36 CN(C)CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn] 0.0 +Dimethyl@R 28.0313 28.0532 H(4)C(2) 0.0 
Post-translational 36 0.0 +Trimethyl@A^Protein_N-term 42.04695 42.0797 H(6)C(3) 0.0 Post-translational 37 0.0 +Trimethyl@R 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 37 0.0 +Trimethyl@K 42.04695 42.0797 H(6)C(3) 59.073499 H(9)C(3)N(1) Post-translational 37 0.5 +Methylthio@C 45.987721 46.0916 H(2)C(1)S(1) 0.0 Multiple 39 CSSC[C@H](N([Xe])([Xe]))C([Rn])=O 0.0 +Methylthio@N 45.987721 46.0916 H(2)C(1)S(1) 0.0 Post-translational 39 0.0 +Methylthio@D 45.987721 46.0916 H(2)C(1)S(1) 0.0 Post-translational 39 0.0 +Methylthio@K 45.987721 46.0916 H(2)C(1)S(1) 0.0 Artefact 39 0.0 +Methylthio@Any_N-term 45.987721 46.0916 H(2)C(1)S(1) 0.0 Artefact 39 0.0 +Sulfo@S 79.956815 80.0632 O(3)S(1) 79.956815 O(3)S(1) Post-translational 40 0.5 +Sulfo@T 79.956815 80.0632 O(3)S(1) 79.956815 O(3)S(1) Post-translational 40 0.5 +Sulfo@Y 79.956815 80.0632 O(3)S(1) 79.956815 O(3)S(1) Post-translational 40 0.5 +Sulfo@C 79.956815 80.0632 O(3)S(1) 0.0 Post-translational 40 0.0 +Hex@C 162.052824 162.1406 H(10)C(6)O(5) 0.0 Other glycosylation 41 0.0 +Hex@W 162.052824 162.1406 H(10)C(6)O(5) 0.0 Other glycosylation 41 0.0 +Hex@T 162.052824 162.1406 H(10)C(6)O(5) 162.052824 H(10)C(6)O(5) O-linked glycosylation 41 0.5 +Hex@S 162.052824 162.1406 H(10)C(6)O(5) 162.052824 H(10)C(6)O(5) O-linked glycosylation 41 0.5 +Hex@Any_N-term 162.052824 162.1406 H(10)C(6)O(5) 54.031694 H(6)O(3) Other glycosylation 41 0.5 +Hex@N 162.052824 162.1406 H(10)C(6)O(5) 162.052824 H(10)C(6)O(5) N-linked glycosylation 41 0.5 +Hex@R 162.052824 162.1406 H(10)C(6)O(5) 54.031694 H(6)O(3) Other glycosylation 41 0.5 +Hex@K 162.052824 162.1406 H(10)C(6)O(5) 54.031694 H(6)O(3) Other glycosylation 41 0.5 +Hex@Y 162.052824 162.1406 H(10)C(6)O(5) 0.0 O-linked glycosylation 41 0.0 +Lipoyl@K 188.032956 188.3103 H(12)C(8)O(1)S(2) 0.0 Post-translational 42 0.0 +HexNAc@C 203.079373 203.1925 H(13)C(8)N(1)O(5) 203.079373 H(13)C(8)N(1)O(5) Other glycosylation 43 0.5 +HexNAc@T 203.079373 203.1925 H(13)C(8)N(1)O(5) 203.079373 H(13)C(8)N(1)O(5) O-linked glycosylation 43 0.5 +HexNAc@S 203.079373 203.1925 H(13)C(8)N(1)O(5) 203.079373 H(13)C(8)N(1)O(5) O-linked glycosylation 43 0.5 +HexNAc@N 203.079373 203.1925 H(13)C(8)N(1)O(5) 203.079373 H(13)C(8)N(1)O(5) N-linked glycosylation 43 0.5 +Farnesyl@C 204.187801 204.3511 H(24)C(15) 0.0 Post-translational 44 0.0 +Myristoyl@C 210.198366 210.3556 H(26)C(14)O(1) 0.0 Post-translational 45 0.0 +Myristoyl@K 210.198366 210.3556 H(26)C(14)O(1) 0.0 Post-translational 45 0.0 +Myristoyl@G^Any_N-term 210.198366 210.3556 H(26)C(14)O(1) 0.0 Post-translational 45 0.0 +PyridoxalPhosphate@K 229.014009 229.1266 H(8)C(8)N(1)O(5)P(1) 0.0 Post-translational 46 0.0 +Palmitoyl@T 238.229666 238.4088 H(30)C(16)O(1) 0.0 Post-translational 47 0.0 +Palmitoyl@S 238.229666 238.4088 H(30)C(16)O(1) 0.0 Post-translational 47 0.0 +Palmitoyl@K 238.229666 238.4088 H(30)C(16)O(1) 0.0 Post-translational 47 0.0 +Palmitoyl@C 238.229666 238.4088 H(30)C(16)O(1) 0.0 Post-translational 47 0.0 +Palmitoyl@Protein_N-term 238.229666 238.4088 H(30)C(16)O(1) 0.0 Post-translational 47 0.0 +GeranylGeranyl@C 272.250401 272.4681 H(32)C(20) 0.0 Post-translational 48 0.0 +Phosphopantetheine@S 340.085794 340.333 H(21)C(11)N(2)O(6)P(1)S(1) 0.0 Post-translational 49 0.0 +FAD@Y 783.141486 783.5339 H(31)C(27)N(9)O(15)P(2) 0.0 Post-translational 50 0.0 +FAD@H 783.141486 783.5339 H(31)C(27)N(9)O(15)P(2) 0.0 Post-translational 50 0.0 +FAD@C 783.141486 783.5339 H(31)C(27)N(9)O(15)P(2) 0.0 Post-translational 50 0.0 +Tripalmitate@C^Protein_N-term 788.725777 789.3049 H(96)C(51)O(5) 0.0 
Post-translational 51 0.0 +Guanidinyl@K 42.021798 42.04 H(2)C(1)N(2) 0.0 Chemical derivative 52 0.0 +Guanidinyl@Any_N-term 42.021798 42.04 H(2)C(1)N(2) 0.0 Chemical derivative 52 0.0 +HNE@K 156.11503 156.2221 H(16)C(9)O(2) 0.0 Post-translational 53 0.0 +HNE@H 156.11503 156.2221 H(16)C(9)O(2) 0.0 Post-translational 53 0.0 +HNE@C 156.11503 156.2221 H(16)C(9)O(2) 0.0 Post-translational 53 0.0 +HNE@A 156.11503 156.2221 H(16)C(9)O(2) 0.0 Post-translational 53 0.0 +HNE@L 156.11503 156.2221 H(16)C(9)O(2) 0.0 Post-translational 53 0.0 +Glucuronyl@T 176.032088 176.1241 H(8)C(6)O(6) 176.032088 H(8)C(6)O(6) O-linked glycosylation 54 0.5 +Glucuronyl@S 176.032088 176.1241 H(8)C(6)O(6) 176.032088 H(8)C(6)O(6) O-linked glycosylation 54 0.5 +Glucuronyl@Protein_N-term 176.032088 176.1241 H(8)C(6)O(6) 0.0 Other glycosylation 54 0.0 +Glutathione@C 305.068156 305.3076 H(15)C(10)N(3)O(6)S(1) 0.0 Post-translational 55 0.0 +Acetyl:2H(3)@Y 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@T 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@S 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@H 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@Any_N-term 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@K 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Acetyl:2H(3)@Protein_N-term 45.029395 45.0552 H(-1)2H(3)C(2)O(1) 0.0 Isotopic label 56 0.0 +Propionyl@Protein_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Multiple 58 C(=O)CC 0.0 +Propionyl@T 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 0.0 +Propionyl@S 56.026215 56.0633 H(4)C(3)O(1) 0.0 Chemical derivative 58 0.0 +Propionyl@K 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 CCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0 +Propionyl@Any_N-term 56.026215 56.0633 H(4)C(3)O(1) 0.0 Isotopic label 58 C(=O)CC 0.0 +Propionyl:13C(3)@Any_N-term 59.036279 59.0412 H(4)13C(3)O(1) 0.0 Isotopic label 59 0.0 +Propionyl:13C(3)@K 59.036279 59.0412 H(4)13C(3)O(1) 0.0 Isotopic label 59 0.0 +GIST-Quat@Any_N-term 127.099714 127.1842 H(13)C(7)N(1)O(1) 59.073499 H(9)C(3)N(1) Isotopic label 60 0.5 +GIST-Quat@K 127.099714 127.1842 H(13)C(7)N(1)O(1) 59.073499 H(9)C(3)N(1) Isotopic label 60 0.5 +GIST-Quat:2H(3)@Any_N-term 130.118544 130.2027 H(10)2H(3)C(7)N(1)O(1) 62.09233 H(6)2H(3)C(3)N(1) Isotopic label 61 0.5 +GIST-Quat:2H(3)@K 130.118544 130.2027 H(10)2H(3)C(7)N(1)O(1) 62.09233 H(6)2H(3)C(3)N(1) Isotopic label 61 0.5 +GIST-Quat:2H(6)@Any_N-term 133.137375 133.2212 H(7)2H(6)C(7)N(1)O(1) 65.11116 H(3)2H(6)C(3)N(1) Isotopic label 62 0.5 +GIST-Quat:2H(6)@K 133.137375 133.2212 H(7)2H(6)C(7)N(1)O(1) 65.11116 H(3)2H(6)C(3)N(1) Isotopic label 62 0.5 +GIST-Quat:2H(9)@Any_N-term 136.156205 136.2397 H(4)2H(9)C(7)N(1)O(1) 68.12999 2H(9)C(3)N(1) Isotopic label 63 0.5 +GIST-Quat:2H(9)@K 136.156205 136.2397 H(4)2H(9)C(7)N(1)O(1) 68.12999 2H(9)C(3)N(1) Isotopic label 63 0.5 +Succinyl@Protein_N-term 100.016044 100.0728 H(4)C(4)O(3) 0.0 Post-translational 64 0.0 +Succinyl@Any_N-term 100.016044 100.0728 H(4)C(4)O(3) 0.0 Isotopic label 64 0.0 +Succinyl@K 100.016044 100.0728 H(4)C(4)O(3) 0.0 Isotopic label 64 C(CCN)CC(C(=O)[Rn])N([Xe])C(=O)CCC(=O)O 0.0 +Succinyl:2H(4)@Any_N-term 104.041151 104.0974 2H(4)C(4)O(3) 0.0 Isotopic label 65 0.0 +Succinyl:2H(4)@K 104.041151 104.0974 2H(4)C(4)O(3) 0.0 Isotopic label 65 0.0 +Succinyl:13C(4)@Any_N-term 104.029463 104.0434 H(4)13C(4)O(3) 0.0 Isotopic label 66 0.0 +Succinyl:13C(4)@K 104.029463 104.0434 H(4)13C(4)O(3) 0.0 
Isotopic label 66 0.0 +probiotinhydrazide@P 258.115047 258.3405 H(18)C(10)N(4)O(2)S(1) 0.0 Chemical derivative 357 0.0 +Pro->pyro-Glu@P 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 359 0.0 +His->Asn@H -23.015984 -23.0366 H(-1)C(-2)N(-1)O(1) 0.0 AA substitution 348 0.0 +His->Asp@H -22.031969 -22.0519 H(-2)C(-2)N(-2)O(2) 0.0 AA substitution 349 0.0 +Trp->Hydroxykynurenin@W 19.989829 19.9881 C(-1)O(2) 0.0 Chemical derivative 350 0.0 +Delta:H(4)C(3)@K 40.0313 40.0639 H(4)C(3) 0.0 Other 256 0.0 +Delta:H(4)C(3)@H 40.0313 40.0639 H(4)C(3) 0.0 Other 256 0.0 +Delta:H(4)C(3)@Protein_N-term 40.0313 40.0639 H(4)C(3) 0.0 Other 256 0.0 +Delta:H(4)C(2)@K 28.0313 28.0532 H(4)C(2) 0.0 Other 255 0.0 +Delta:H(4)C(2)@H 28.0313 28.0532 H(4)C(2) 0.0 Other 255 0.0 +Delta:H(4)C(2)@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Other 255 0.0 +Cys->Dha@C -33.987721 -34.0809 H(-2)S(-1) 0.0 Chemical derivative 368 0.0 +Arg->GluSA@R -43.053433 -43.0711 H(-5)C(-1)N(-3)O(1) 0.0 Chemical derivative 344 0.0 +Trioxidation@Y 47.984744 47.9982 O(3) 0.0 Chemical derivative 345 0.0 +Trioxidation@W 47.984744 47.9982 O(3) 0.0 Chemical derivative 345 0.0 +Trioxidation@C 47.984744 47.9982 O(3) 0.0 Chemical derivative 345 0.0 +Trioxidation@F 47.984744 47.9982 O(3) 0.0 Artefact 345 0.0 +Iminobiotin@Any_N-term 225.093583 225.3106 H(15)C(10)N(3)O(1)S(1) 0.0 Chemical derivative 89 0.0 +Iminobiotin@K 225.093583 225.3106 H(15)C(10)N(3)O(1)S(1) 0.0 Chemical derivative 89 0.0 +ESP@Any_N-term 338.177647 338.4682 H(26)C(16)N(4)O(2)S(1) 0.0 Isotopic label 90 0.0 +ESP@K 338.177647 338.4682 H(26)C(16)N(4)O(2)S(1) 0.0 Isotopic label 90 0.0 +ESP:2H(10)@Any_N-term 348.240414 348.5299 H(16)2H(10)C(16)N(4)O(2)S(1) 0.0 Isotopic label 91 0.0 +ESP:2H(10)@K 348.240414 348.5299 H(16)2H(10)C(16)N(4)O(2)S(1) 0.0 Isotopic label 91 0.0 +NHS-LC-Biotin@Any_N-term 339.161662 339.453 H(25)C(16)N(3)O(3)S(1) 0.0 Chemical derivative 92 0.0 +NHS-LC-Biotin@K 339.161662 339.453 H(25)C(16)N(3)O(3)S(1) 0.0 Chemical derivative 92 0.0 +EDT-maleimide-PEO-biotin@T 601.206246 601.8021 H(39)C(25)N(5)O(6)S(3) 0.0 Chemical derivative 93 0.0 +EDT-maleimide-PEO-biotin@S 601.206246 601.8021 H(39)C(25)N(5)O(6)S(3) 0.0 Chemical derivative 93 0.0 +IMID@K 68.037448 68.0773 H(4)C(3)N(2) 0.0 Isotopic label 94 0.0 +IMID:2H(4)@K 72.062555 72.1019 2H(4)C(3)N(2) 0.0 Isotopic label 95 0.0 +Lysbiotinhydrazide@K 241.088497 241.31 H(15)C(10)N(3)O(2)S(1) 0.0 Chemical derivative 353 0.0 +Propionamide:2H(3)@C 74.055944 74.0964 H(2)2H(3)C(3)N(1)O(1) 0.0 Isotopic label 97 0.0 +Nitro@Y 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Chemical derivative 354 O=[N+]([O-])c1cc(ccc1O)C[C@@H](C(=O)[Rn])N([Xe])([Xe]) 0.0 +Nitro@W 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Chemical derivative 354 0.0 +Nitro@F 44.985078 44.9976 H(-1)N(1)O(2) 0.0 Artefact 354 0.0 +ICAT-C@C 227.126991 227.2603 H(17)C(10)N(3)O(3) 0.0 Isotopic label 105 0.0 +Delta:H(2)C(2)@Protein_N-term 26.01565 26.0373 H(2)C(2) 0.0 Other 254 0.0 +Delta:H(2)C(2)@K 26.01565 26.0373 H(2)C(2) 0.0 Other 254 0.0 +Delta:H(2)C(2)@H 26.01565 26.0373 H(2)C(2) 0.0 Other 254 0.0 +Delta:H(2)C(2)@Any_N-term 26.01565 26.0373 H(2)C(2) 0.0 Other 254 0.0 +Trp->Kynurenin@W 3.994915 3.9887 C(-1)O(1) 0.0 Chemical derivative 351 0.0 +Lys->Allysine@K -1.031634 -1.0311 H(-3)N(-1)O(1) 0.0 Post-translational 352 0.0 +ICAT-C:13C(9)@C 236.157185 236.1942 H(17)C(1)13C(9)N(3)O(3) 0.0 Isotopic label 106 0.0 +FormylMet@Protein_N-term 159.035399 159.2062 H(9)C(6)N(1)O(2)S(1) 0.0 Pre-translational 107 0.0 +Nethylmaleimide@C 125.047679 125.1253 H(7)C(6)N(1)O(2) 0.0 Chemical derivative 108 0.0 
+OxLysBiotinRed@K 354.172562 354.4676 H(26)C(16)N(4)O(3)S(1) 0.0 Chemical derivative 112 0.0 +IBTP@C 316.138088 316.3759 H(21)C(22)P(1) 0.0 Chemical derivative 119 0.0 +OxLysBiotin@K 352.156911 352.4518 H(24)C(16)N(4)O(3)S(1) 0.0 Chemical derivative 113 0.0 +OxProBiotinRed@P 371.199111 371.4982 H(29)C(16)N(5)O(3)S(1) 0.0 Chemical derivative 114 0.0 +OxProBiotin@P 369.183461 369.4823 H(27)C(16)N(5)O(3)S(1) 0.0 Chemical derivative 115 0.0 +OxArgBiotin@R 310.135113 310.4118 H(22)C(15)N(2)O(3)S(1) 0.0 Chemical derivative 116 0.0 +OxArgBiotinRed@R 312.150763 312.4277 H(24)C(15)N(2)O(3)S(1) 0.0 Chemical derivative 117 0.0 +EDT-iodoacetyl-PEO-biotin@T 490.174218 490.7034 H(34)C(20)N(4)O(4)S(3) 0.0 Chemical derivative 118 0.0 +EDT-iodoacetyl-PEO-biotin@S 490.174218 490.7034 H(34)C(20)N(4)O(4)S(3) 0.0 Chemical derivative 118 0.0 +GG@C 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Other 121 0.0 +GG@T 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Other 121 0.0 +GG@S 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Other 121 0.0 +GG@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Other 121 1000000.0 +GG@Protein_N-term 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Post-translational 121 0.0 +Formyl@Protein_N-term 27.994915 28.0101 C(1)O(1) 0.0 Post-translational 122 0.0 +Formyl@T 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0 +Formyl@K 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0 +Formyl@Any_N-term 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0 +Formyl@S 27.994915 28.0101 C(1)O(1) 0.0 Artefact 122 0.0 +ICAT-H@C 345.097915 345.7754 H(20)C(15)N(1)O(6)Cl(1) 0.0 Isotopic label 123 0.0 +ICAT-H:13C(6)@C 351.118044 351.7313 H(20)C(9)13C(6)N(1)O(6)Cl(1) 0.0 Isotopic label 124 0.0 +Cation:K@Any_C-term 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 O[K] 0.0 +Cation:K@E 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 0.0 +Cation:K@D 37.955882 38.0904 H(-1)K(1) 0.0 Artefact 530 0.0 +Xlink:DTSSP[88]@Protein_N-term 87.998285 88.1283 H(4)C(3)O(1)S(1) 0.0 Chemical derivative 126 0.0 +Xlink:DTSSP[88]@K 87.998285 88.1283 H(4)C(3)O(1)S(1) 0.0 Chemical derivative 126 0.0 +Xlink:EGS[226]@K 226.047738 226.1828 H(10)C(10)O(6) 0.0 Chemical derivative 1897 0.0 +Xlink:EGS[226]@Protein_N-term 226.047738 226.1828 H(10)C(10)O(6) 0.0 Chemical derivative 1897 0.0 +Fluoro@Y 17.990578 17.9905 H(-1)F(1) 0.0 Non-standard residue 127 0.0 +Fluoro@W 17.990578 17.9905 H(-1)F(1) 0.0 Non-standard residue 127 0.0 +Fluoro@F 17.990578 17.9905 H(-1)F(1) 0.0 Non-standard residue 127 0.0 +Fluoro@A 17.990578 17.9905 H(-1)F(1) 0.0 Chemical derivative 127 0.0 +Fluorescein@C 387.074287 387.3417 H(13)C(22)N(1)O(6) 0.0 Chemical derivative 128 0.0 +Iodo@H 125.896648 125.8965 H(-1)I(1) 0.0 Chemical derivative 129 0.0 +Iodo@Y 125.896648 125.8965 H(-1)I(1) 0.0 Chemical derivative 129 0.0 +Diiodo@Y 251.793296 251.7931 H(-2)I(2) 0.0 Chemical derivative 130 0.0 +Diiodo@H 251.793296 251.7931 H(-2)I(2) 0.0 Chemical derivative 130 0.0 +Triiodo@Y 377.689944 377.6896 H(-3)I(3) 0.0 Chemical derivative 131 0.0 +Myristoleyl@G^Protein_N-term 208.182715 208.3398 H(24)C(14)O(1) 0.0 Co-translational 134 0.0 +Pro->Pyrrolidinone@P -30.010565 -30.026 H(-2)C(-1)O(-1) 0.0 Chemical derivative 360 0.0 +Myristoyl+Delta:H(-4)@G^Protein_N-term 206.167065 206.3239 H(22)C(14)O(1) 0.0 Co-translational 135 0.0 +Benzoyl@Any_N-term 104.026215 104.1061 H(4)C(7)O(1) 0.0 Isotopic label 136 0.0 +Benzoyl@K 104.026215 104.1061 H(4)C(7)O(1) 0.0 Isotopic label 136 0.0 +Hex(5)HexNAc(2)@N 1216.422863 1217.088 H(76)C(46)N(2)O(35) 1216.422863 H(76)C(46)N(2)O(35) N-linked glycosylation 137 0.5 +Dansyl@Any_N-term 233.051049 
233.2862 H(11)C(12)N(1)O(2)S(1) 0.0 Chemical derivative 139 0.0 +Dansyl@K 233.051049 233.2862 H(11)C(12)N(1)O(2)S(1) 0.0 Chemical derivative 139 0.0 +a-type-ion@Any_C-term -46.005479 -46.0254 H(-2)C(-1)O(-2) 0.0 Other 140 0.0 +Amidine@Any_N-term 41.026549 41.0519 H(3)C(2)N(1) 0.0 Chemical derivative 141 0.0 +Amidine@K 41.026549 41.0519 H(3)C(2)N(1) 0.0 Chemical derivative 141 0.0 +HexNAc(1)dHex(1)@T 349.137281 349.3337 H(23)C(14)N(1)O(9) 349.137281 H(23)C(14)N(1)O(9) O-linked glycosylation 142 0.5 +HexNAc(1)dHex(1)@S 349.137281 349.3337 H(23)C(14)N(1)O(9) 349.137281 H(23)C(14)N(1)O(9) O-linked glycosylation 142 0.5 +HexNAc(1)dHex(1)@N 349.137281 349.3337 H(23)C(14)N(1)O(9) 349.137281 H(23)C(14)N(1)O(9) N-linked glycosylation 142 0.5 +HexNAc(2)@T 406.158745 406.385 H(26)C(16)N(2)O(10) 406.158745 H(26)C(16)N(2)O(10) O-linked glycosylation 143 0.5 +HexNAc(2)@S 406.158745 406.385 H(26)C(16)N(2)O(10) 406.158745 H(26)C(16)N(2)O(10) O-linked glycosylation 143 0.5 +HexNAc(2)@N 406.158745 406.385 H(26)C(16)N(2)O(10) 406.158745 H(26)C(16)N(2)O(10) N-linked glycosylation 143 0.5 +Hex(3)@T 486.158471 486.4218 H(30)C(18)O(15) 486.158471 H(30)C(18)O(15) O-linked glycosylation 144 0.5 +Hex(3)@S 486.158471 486.4218 H(30)C(18)O(15) 486.158471 H(30)C(18)O(15) O-linked glycosylation 144 0.5 +Hex(3)@N 486.158471 486.4218 H(30)C(18)O(15) 486.158471 H(30)C(18)O(15) N-linked glycosylation 144 0.5 +HexNAc(1)dHex(2)@N 495.19519 495.4749 H(33)C(20)N(1)O(13) 495.19519 H(33)C(20)N(1)O(13) N-linked glycosylation 145 0.5 +Hex(1)HexNAc(1)dHex(1)@T 511.190105 511.4743 H(33)C(20)N(1)O(14) 511.190105 H(33)C(20)N(1)O(14) O-linked glycosylation 146 0.5 +Hex(1)HexNAc(1)dHex(1)@S 511.190105 511.4743 H(33)C(20)N(1)O(14) 511.190105 H(33)C(20)N(1)O(14) O-linked glycosylation 146 0.5 +Hex(1)HexNAc(1)dHex(1)@N 511.190105 511.4743 H(33)C(20)N(1)O(14) 511.190105 H(33)C(20)N(1)O(14) N-linked glycosylation 146 0.5 +HexNAc(2)dHex(1)@N 552.216654 552.5262 H(36)C(22)N(2)O(14) 552.216654 H(36)C(22)N(2)O(14) N-linked glycosylation 147 0.5 +Hex(1)HexNAc(2)@T 568.211569 568.5256 H(36)C(22)N(2)O(15) 568.211569 H(36)C(22)N(2)O(15) O-linked glycosylation 148 0.5 +Hex(1)HexNAc(2)@S 568.211569 568.5256 H(36)C(22)N(2)O(15) 568.211569 H(36)C(22)N(2)O(15) O-linked glycosylation 148 0.5 +Hex(1)HexNAc(2)@N 568.211569 568.5256 H(36)C(22)N(2)O(15) 568.211569 H(36)C(22)N(2)O(15) N-linked glycosylation 148 0.5 +Hex(1)HexNAc(1)NeuAc(1)@T 656.227613 656.5877 H(40)C(25)N(2)O(18) 656.227613 H(40)C(25)N(2)O(18) O-linked glycosylation 149 0.5 +Hex(1)HexNAc(1)NeuAc(1)@S 656.227613 656.5877 H(40)C(25)N(2)O(18) 656.227613 H(40)C(25)N(2)O(18) O-linked glycosylation 149 0.5 +Hex(1)HexNAc(1)NeuAc(1)@N 656.227613 656.5877 H(40)C(25)N(2)O(18) 656.227613 H(40)C(25)N(2)O(18) N-linked glycosylation 149 0.5 +HexNAc(2)dHex(2)@N 698.274563 698.6674 H(46)C(28)N(2)O(18) 698.274563 H(46)C(28)N(2)O(18) N-linked glycosylation 150 0.5 +Hex(1)HexNAc(2)Pent(1)@N 700.253828 700.6403 H(44)C(27)N(2)O(19) 700.253828 H(44)C(27)N(2)O(19) N-linked glycosylation 151 0.5 +Hex(1)HexNAc(2)dHex(1)@T 714.269478 714.6668 H(46)C(28)N(2)O(19) 714.269478 H(46)C(28)N(2)O(19) O-linked glycosylation 152 0.5 +Hex(1)HexNAc(2)dHex(1)@S 714.269478 714.6668 H(46)C(28)N(2)O(19) 714.269478 H(46)C(28)N(2)O(19) O-linked glycosylation 152 0.5 +Hex(1)HexNAc(2)dHex(1)@N 714.269478 714.6668 H(46)C(28)N(2)O(19) 714.269478 H(46)C(28)N(2)O(19) N-linked glycosylation 152 0.5 +Hex(2)HexNAc(2)@T 730.264392 730.6662 H(46)C(28)N(2)O(20) 730.264392 H(46)C(28)N(2)O(20) O-linked glycosylation 153 0.5 +Hex(2)HexNAc(2)@S 
730.264392 730.6662 H(46)C(28)N(2)O(20) 730.264392 H(46)C(28)N(2)O(20) O-linked glycosylation 153 0.5 +Hex(2)HexNAc(2)@N 730.264392 730.6662 H(46)C(28)N(2)O(20) 730.264392 H(46)C(28)N(2)O(20) N-linked glycosylation 153 0.5 +Hex(3)HexNAc(1)Pent(1)@N 821.280102 821.7289 H(51)C(31)N(1)O(24) 821.280102 H(51)C(31)N(1)O(24) N-linked glycosylation 154 0.5 +Hex(1)HexNAc(2)dHex(1)Pent(1)@N 846.311736 846.7815 H(54)C(33)N(2)O(23) 846.311736 H(54)C(33)N(2)O(23) N-linked glycosylation 155 0.5 +Hex(1)HexNAc(2)dHex(2)@T 860.327386 860.808 H(56)C(34)N(2)O(23) 860.327386 H(56)C(34)N(2)O(23) O-linked glycosylation 156 0.5 +Hex(1)HexNAc(2)dHex(2)@S 860.327386 860.808 H(56)C(34)N(2)O(23) 860.327386 H(56)C(34)N(2)O(23) O-linked glycosylation 156 0.5 +Hex(1)HexNAc(2)dHex(2)@N 860.327386 860.808 H(56)C(34)N(2)O(23) 860.327386 H(56)C(34)N(2)O(23) N-linked glycosylation 156 0.5 +Hex(2)HexNAc(2)Pent(1)@N 862.306651 862.7809 H(54)C(33)N(2)O(24) 862.306651 H(54)C(33)N(2)O(24) N-linked glycosylation 157 0.5 +Hex(2)HexNAc(2)dHex(1)@T 876.322301 876.8074 H(56)C(34)N(2)O(24) 876.322301 H(56)C(34)N(2)O(24) O-linked glycosylation 158 0.5 +Hex(2)HexNAc(2)dHex(1)@S 876.322301 876.8074 H(56)C(34)N(2)O(24) 876.322301 H(56)C(34)N(2)O(24) O-linked glycosylation 158 0.5 +Hex(2)HexNAc(2)dHex(1)@N 876.322301 876.8074 H(56)C(34)N(2)O(24) 876.322301 H(56)C(34)N(2)O(24) N-linked glycosylation 158 0.5 +Hex(3)HexNAc(2)@T 892.317216 892.8068 H(56)C(34)N(2)O(25) 892.317216 H(56)C(34)N(2)O(25) O-linked glycosylation 159 0.5 +Hex(3)HexNAc(2)@S 892.317216 892.8068 H(56)C(34)N(2)O(25) 892.317216 H(56)C(34)N(2)O(25) O-linked glycosylation 159 0.5 +Hex(3)HexNAc(2)@N 892.317216 892.8068 H(56)C(34)N(2)O(25) 892.317216 H(56)C(34)N(2)O(25) N-linked glycosylation 159 0.5 +Hex(1)HexNAc(1)NeuAc(2)@T 947.323029 947.8423 H(57)C(36)N(3)O(26) 947.323029 H(57)C(36)N(3)O(26) O-linked glycosylation 160 0.5 +Hex(1)HexNAc(1)NeuAc(2)@S 947.323029 947.8423 H(57)C(36)N(3)O(26) 947.323029 H(57)C(36)N(3)O(26) O-linked glycosylation 160 0.5 +Hex(1)HexNAc(1)NeuAc(2)@N 947.323029 947.8423 H(57)C(36)N(3)O(26) 947.323029 H(57)C(36)N(3)O(26) N-linked glycosylation 160 0.5 +Hex(3)HexNAc(2)Phos(1)@N 972.283547 972.7867 H(57)C(34)N(2)O(28)P(1) 972.283547 H(57)C(34)N(2)O(28)P(1) N-linked glycosylation 161 0.5 +Delta:S(-1)Se(1)@M 47.944449 46.895 S(-1)Se(1) 0.0 Non-standard residue 162 0.0 +Delta:S(-1)Se(1)@C 47.944449 46.895 S(-1)Se(1) 0.0 Non-standard residue 162 0.0 +NBS:13C(6)@W 159.008578 159.1144 H(3)13C(6)N(1)O(2)S(1) 0.0 Chemical derivative 171 0.0 +Methyl:2H(3)13C(1)@K 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0 +Methyl:2H(3)13C(1)@R 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0 +Methyl:2H(3)13C(1)@Any_N-term 18.037835 18.0377 H(-1)2H(3)13C(1) 0.0 Isotopic label 329 0.0 +Dimethyl:2H(6)13C(2)@Protein_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 [13C]([2H])([2H])([2H]) 0.0 +Dimethyl:2H(6)13C(2)@Any_N-term 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 [13C]([2H])([2H])([2H]) 0.0 +Dimethyl:2H(6)13C(2)@R 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0 +Dimethyl:2H(6)13C(2)@K 36.07567 36.0754 H(-2)2H(6)13C(2) 0.0 Isotopic label 330 0.0 +NBS@W 152.988449 153.1585 H(3)C(6)N(1)O(2)S(1) 0.0 Chemical derivative 172 0.0 +Delta:H(-1)N(-1)18O(1)@N 2.988261 2.9845 H(-1)N(-1)18O(1) 0.0 Isotopic label 170 0.0 +QAT@C 171.149738 171.26 H(19)C(9)N(2)O(1) 0.0 Chemical derivative 195 0.0 +BHT@H 218.167065 218.3346 H(22)C(15)O(1) 0.0 Other 176 0.0 +BHT@K 218.167065 218.3346 H(22)C(15)O(1) 0.0 Other 176 0.0 +BHT@C 
218.167065 218.3346 H(22)C(15)O(1) 0.0 Other 176 0.0 +Delta:H(4)C(2)O(-1)S(1)@S 44.008456 44.1188 H(4)C(2)O(-1)S(1) 0.0 Chemical derivative 327 0.0 +DAET@T 87.050655 87.1866 H(9)C(4)N(1)O(-1)S(1) 0.0 Chemical derivative 178 0.0 +DAET@S 87.050655 87.1866 H(9)C(4)N(1)O(-1)S(1) 0.0 Chemical derivative 178 0.0 +Pro->Pyrrolidone@P -27.994915 -28.0101 C(-1)O(-1) 0.0 Chemical derivative 369 0.0 +Label:13C(9)@Y 9.030193 8.9339 C(-9)13C(9) 0.0 Isotopic label 184 0.0 +Label:13C(9)@F 9.030193 8.9339 C(-9)13C(9) 0.0 Isotopic label 184 0.0 +Label:13C(9)+Phospho@Y 88.996524 88.9138 H(1)C(-9)13C(9)O(3)P(1) 0.0 Isotopic label 185 0.0 +Label:13C(6)@I 6.020129 5.9559 C(-6)13C(6) 0.0 Isotopic label 188 0.0 +Label:13C(6)@L 6.020129 5.9559 C(-6)13C(6) 0.0 Isotopic label 188 0.0 +Label:13C(6)@K 6.020129 5.9559 C(-6)13C(6) 0.0 Isotopic label 188 0.0 +Label:13C(6)@R 6.020129 5.9559 C(-6)13C(6) 0.0 Isotopic label 188 0.0 +HPG@R 132.021129 132.1162 H(4)C(8)O(2) 0.0 Chemical derivative 186 0.0 +2HPG@R 282.052824 282.2476 H(10)C(16)O(5) 0.0 Chemical derivative 187 0.0 +QAT:2H(3)@C 174.168569 174.2784 H(16)2H(3)C(9)N(2)O(1) 0.0 Isotopic label 196 0.0 +Label:18O(2)@Any_C-term 4.008491 3.9995 O(-2)18O(2) 0.0 Isotopic label 193 0.0 +AccQTag@Any_N-term 170.048013 170.1674 H(6)C(10)N(2)O(1) 0.0 Chemical derivative 194 0.0 +AccQTag@K 170.048013 170.1674 H(6)C(10)N(2)O(1) 0.0 Chemical derivative 194 0.0 +Dimethyl:2H(4)@Protein_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 C([2H])([2H])([1H]) 0.0 +Dimethyl:2H(4)@Any_N-term 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 C([2H])([2H])([1H]) 0.0 +Dimethyl:2H(4)@K 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0 +Dimethyl:2H(4)@R 32.056407 32.0778 2H(4)C(2) 0.0 Isotopic label 199 0.0 +EQAT@C 184.157563 184.2786 H(20)C(10)N(2)O(1) 0.0 Chemical derivative 197 0.0 +EQAT:2H(5)@C 189.188947 189.3094 H(15)2H(5)C(10)N(2)O(1) 0.0 Isotopic label 198 0.0 +Ethanedithiol@T 75.980527 76.1838 H(4)C(2)O(-1)S(2) 0.0 Chemical derivative 200 0.0 +Ethanedithiol@S 75.980527 76.1838 H(4)C(2)O(-1)S(2) 0.0 Chemical derivative 200 0.0 +NEIAA:2H(5)@Y 90.084148 90.1353 H(2)2H(5)C(4)N(1)O(1) 0.0 Isotopic label 212 0.0 +NEIAA:2H(5)@C 90.084148 90.1353 H(2)2H(5)C(4)N(1)O(1) 0.0 Isotopic label 212 0.0 +Delta:H(6)C(6)O(1)@K 94.041865 94.1112 H(6)C(6)O(1) 0.0 Other 205 0.0 +Delta:H(4)C(3)O(1)@K 56.026215 56.0633 H(4)C(3)O(1) 0.0 Other 206 0.0 +Delta:H(4)C(3)O(1)@H 56.026215 56.0633 H(4)C(3)O(1) 0.0 Other 206 0.0 +Delta:H(4)C(3)O(1)@C 56.026215 56.0633 H(4)C(3)O(1) 0.0 Other 206 0.0 +Delta:H(4)C(3)O(1)@R 56.026215 56.0633 H(4)C(3)O(1) 0.0 Artefact 206 0.0 +Delta:H(2)C(3)@K 38.01565 38.048 H(2)C(3) 0.0 Other 207 0.0 +Delta:H(4)C(6)@K 76.0313 76.096 H(4)C(6) 0.0 Other 208 0.0 +Delta:H(8)C(6)O(2)@K 112.05243 112.1265 H(8)C(6)O(2) 0.0 Other 209 0.0 +ADP-Ribosyl@D 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 0.0 Other glycosylation 213 0.0 +ADP-Ribosyl@K 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 0.0 Other glycosylation 213 0.0 +ADP-Ribosyl@E 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 0.0 Other glycosylation 213 0.0 +ADP-Ribosyl@T 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 541.06111 H(21)C(15)N(5)O(13)P(2) O-linked glycosylation 213 0.5 +ADP-Ribosyl@S 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 541.06111 H(21)C(15)N(5)O(13)P(2) O-linked glycosylation 213 0.5 +ADP-Ribosyl@C 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 0.0 Other glycosylation 213 0.0 +ADP-Ribosyl@N 541.06111 541.3005 H(21)C(15)N(5)O(13)P(2) 541.06111 H(21)C(15)N(5)O(13)P(2) N-linked glycosylation 213 0.5 +ADP-Ribosyl@R 541.06111 
541.3005 H(21)C(15)N(5)O(13)P(2) 0.0 Other glycosylation 213 0.0 +NEIAA@Y 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Isotopic label 211 0.0 +NEIAA@C 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Isotopic label 211 0.0 +iTRAQ4plex@C 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@T 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@Protein_N-term 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@S 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@H 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@Y 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@Any_N-term 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +iTRAQ4plex@K 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 214 0.0 +Crotonaldehyde@K 70.041865 70.0898 H(6)C(4)O(1) 0.0 Other 253 0.0 +Crotonaldehyde@H 70.041865 70.0898 H(6)C(4)O(1) 0.0 Other 253 0.0 +Crotonaldehyde@C 70.041865 70.0898 H(6)C(4)O(1) 0.0 Other 253 0.0 +Bromo@F 77.910511 78.8961 H(-1)Br(1) 0.0 Post-translational 340 0.0 +Bromo@H 77.910511 78.8961 H(-1)Br(1) 0.0 Post-translational 340 0.0 +Bromo@W 77.910511 78.8961 H(-1)Br(1) 0.0 Post-translational 340 0.0 +Bromo@Y 77.910511 78.8961 H(-1)Br(1) 0.0 Artefact 340 0.0 +Amino@Y 15.010899 15.0146 H(1)N(1) 0.0 Chemical derivative 342 0.0 +Argbiotinhydrazide@R 199.066699 199.27 H(13)C(9)N(1)O(2)S(1) 0.0 Chemical derivative 343 0.0 +Label:18O(1)@Y 2.004246 1.9998 O(-1)18O(1) 0.0 Isotopic label 258 0.0 +Label:18O(1)@T 2.004246 1.9998 O(-1)18O(1) 0.0 Isotopic label 258 0.0 +Label:18O(1)@S 2.004246 1.9998 O(-1)18O(1) 0.0 Isotopic label 258 0.0 +Label:18O(1)@Any_C-term 2.004246 1.9998 O(-1)18O(1) 0.0 Isotopic label 258 0.0 +Label:13C(6)15N(2)@K 8.014199 7.9427 C(-6)13C(6)N(-2)15N(2) 0.0 Isotopic label 259 0.0 +Thiophospho@Y 95.943487 96.0455 H(1)O(2)P(1)S(1) 0.0 Other 260 0.0 +Thiophospho@T 95.943487 96.0455 H(1)O(2)P(1)S(1) 0.0 Other 260 0.0 +Thiophospho@S 95.943487 96.0455 H(1)O(2)P(1)S(1) 0.0 Other 260 0.0 +SPITC@K 214.971084 215.2495 H(5)C(7)N(1)O(3)S(2) 0.0 Chemical derivative 261 0.0 +SPITC@Any_N-term 214.971084 215.2495 H(5)C(7)N(1)O(3)S(2) 0.0 Chemical derivative 261 0.0 +IGBP@C 296.016039 297.1478 H(13)C(12)N(2)O(2)Br(1) 0.0 Isotopic label 243 0.0 +Cytopiloyne@Y 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@S 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@R 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@P 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@Any_N-term 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@K 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne@C 362.136553 362.3738 H(22)C(19)O(7) 0.0 Chemical derivative 270 0.0 +Cytopiloyne+water@Y 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Cytopiloyne+water@T 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Cytopiloyne+water@S 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Cytopiloyne+water@R 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Cytopiloyne+water@Any_N-term 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Cytopiloyne+water@K 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 
+Cytopiloyne+water@C 380.147118 380.3891 H(24)C(19)O(8) 0.0 Chemical derivative 271 0.0 +Label:13C(6)15N(4)@R 10.008269 9.9296 C(-6)13C(6)N(-4)15N(4) 0.0 Isotopic label 267 0.0 +Label:13C(9)15N(1)@F 10.027228 9.9273 C(-9)13C(9)N(-1)15N(1) 0.0 Isotopic label 269 0.0 +Label:2H(3)@L 3.01883 3.0185 H(-3)2H(3) 0.0 Isotopic label 262 0.0 +Label:2H(3)@M 3.01883 3.0185 H(-3)2H(3) 0.0 Isotopic label 262 0.0 +Label:13C(5)15N(1)@M 6.013809 5.9567 C(-5)13C(5)N(-1)15N(1) 0.0 Isotopic label 268 0.0 +Label:13C(5)15N(1)@P 6.013809 5.9567 C(-5)13C(5)N(-1)15N(1) 0.0 Isotopic label 268 0.0 +Label:13C(5)15N(1)@V 6.013809 5.9567 C(-5)13C(5)N(-1)15N(1) 0.0 Isotopic label 268 0.0 +Label:13C(5)15N(1)@E 6.013809 5.9567 C(-5)13C(5)N(-1)15N(1) 0.0 Isotopic label 268 0.0 +PET@T 121.035005 121.2028 H(7)C(7)N(1)O(-1)S(1) 0.0 Chemical derivative 264 0.0 +PET@S 121.035005 121.2028 H(7)C(7)N(1)O(-1)S(1) 0.0 Chemical derivative 264 0.0 +CAF@Any_N-term 135.983029 136.1265 H(4)C(3)O(4)S(1) 0.0 Chemical derivative 272 0.0 +Xlink:BS2G[96]@Protein_N-term 96.021129 96.0841 H(4)C(5)O(2) 0.0 Chemical derivative 1905 0.0 +Xlink:BS2G[96]@K 96.021129 96.0841 H(4)C(5)O(2) 0.0 Chemical derivative 1905 0.0 +Nitrosyl@C 28.990164 28.9982 H(-1)N(1)O(1) 0.0 Post-translational 275 0.0 +Nitrosyl@Y 28.990164 28.9982 H(-1)N(1)O(1) 0.0 Chemical derivative 275 0.0 +Ser/Thr-KDO@T 220.058303 220.1767 H(12)C(8)O(7) 220.058303 H(12)C(8)O(7) O-linked glycosylation 2022 0.5 +Ser/Thr-KDO@S 220.058303 220.1767 H(12)C(8)O(7) 220.058303 H(12)C(8)O(7) O-linked glycosylation 2022 0.5 +AEBS@Y 183.035399 183.2276 H(9)C(8)N(1)O(2)S(1) 0.0 Artefact 276 0.0 +AEBS@S 183.035399 183.2276 H(9)C(8)N(1)O(2)S(1) 0.0 Artefact 276 0.0 +AEBS@Protein_N-term 183.035399 183.2276 H(9)C(8)N(1)O(2)S(1) 0.0 Artefact 276 0.0 +AEBS@K 183.035399 183.2276 H(9)C(8)N(1)O(2)S(1) 0.0 Artefact 276 0.0 +AEBS@H 183.035399 183.2276 H(9)C(8)N(1)O(2)S(1) 0.0 Artefact 276 0.0 +Ethanolyl@K 44.026215 44.0526 H(4)C(2)O(1) 0.0 Chemical derivative 278 0.0 +Ethanolyl@C 44.026215 44.0526 H(4)C(2)O(1) 0.0 Chemical derivative 278 0.0 +Ethanolyl@R 44.026215 44.0526 H(4)C(2)O(1) 0.0 Chemical derivative 278 0.0 +Label:13C(6)15N(2)+Dimethyl@K 36.045499 35.9959 H(4)C(-4)13C(6)N(-2)15N(2) 0.0 Isotopic label 987 0.0 +HMVK@C 86.036779 86.0892 H(6)C(4)O(2) 0.0 Chemical derivative 371 0.0 +Ethyl@Any_C-term 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 OCC 0.0 +Ethyl@Protein_N-term 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 0.0 +Ethyl@E 28.0313 28.0532 H(4)C(2) 0.0 Artefact 280 0.0 +Ethyl@Any_N-term 28.0313 28.0532 H(4)C(2) 0.0 Multiple 280 0.0 +Ethyl@K 28.0313 28.0532 H(4)C(2) 0.0 Multiple 280 0.0 +Ethyl@D 28.0313 28.0532 H(4)C(2) 0.0 Chemical derivative 280 0.0 +CoenzymeA@C 765.09956 765.5182 H(34)C(21)N(7)O(16)P(3)S(1) 0.0 Post-translational 281 0.0 +Methyl+Deamidated@Q 14.999666 15.0113 H(1)C(1)N(-1)O(1) 0.0 Post-translational 528 0.0 +Methyl+Deamidated@N 14.999666 15.0113 H(1)C(1)N(-1)O(1) 0.0 Chemical derivative 528 0.0 +Delta:H(5)C(2)@P 29.039125 29.0611 H(5)C(2) 0.0 Post-translational 529 0.0 +Methyl:2H(2)@K 16.028204 16.0389 2H(2)C(1) 0.0 Isotopic label 284 0.0 +Methyl:2H(2)@Any_N-term 16.028204 16.0389 2H(2)C(1) 0.0 Isotopic label 284 0.0 +SulfanilicAcid@E 155.004099 155.1744 H(5)C(6)N(1)O(2)S(1) 0.0 Isotopic label 285 0.0 +SulfanilicAcid@D 155.004099 155.1744 H(5)C(6)N(1)O(2)S(1) 0.0 Isotopic label 285 0.0 +SulfanilicAcid@Any_C-term 155.004099 155.1744 H(5)C(6)N(1)O(2)S(1) 0.0 Isotopic label 285 0.0 +SulfanilicAcid:13C(6)@E 161.024228 161.1303 H(5)13C(6)N(1)O(2)S(1) 0.0 Chemical 
derivative 286 0.0 +SulfanilicAcid:13C(6)@D 161.024228 161.1303 H(5)13C(6)N(1)O(2)S(1) 0.0 Chemical derivative 286 0.0 +SulfanilicAcid:13C(6)@Any_C-term 161.024228 161.1303 H(5)13C(6)N(1)O(2)S(1) 0.0 Chemical derivative 286 0.0 +Biotin-PEO-Amine@D 356.188212 356.4835 H(28)C(16)N(4)O(3)S(1) 0.0 Chemical derivative 289 0.0 +Biotin-PEO-Amine@Protein_C-term 356.188212 356.4835 H(28)C(16)N(4)O(3)S(1) 0.0 Chemical derivative 289 0.0 +Biotin-PEO-Amine@E 356.188212 356.4835 H(28)C(16)N(4)O(3)S(1) 0.0 Chemical derivative 289 0.0 +Trp->Oxolactone@W 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 288 0.0 +Biotin-HPDP@C 428.191582 428.6124 H(32)C(19)N(4)O(3)S(2) 0.0 Chemical derivative 290 0.0 +Delta:Hg(1)@C 201.970617 200.59 Hg(1) 0.0 Chemical derivative 291 0.0 +IodoU-AMP@Y 322.020217 322.1654 H(11)C(9)N(2)O(9)P(1) 0.0 Chemical derivative 292 0.0 +IodoU-AMP@W 322.020217 322.1654 H(11)C(9)N(2)O(9)P(1) 0.0 Chemical derivative 292 0.0 +IodoU-AMP@F 322.020217 322.1654 H(11)C(9)N(2)O(9)P(1) 0.0 Chemical derivative 292 0.0 +CAMthiopropanoyl@Protein_N-term 145.019749 145.1796 H(7)C(5)N(1)O(2)S(1) 0.0 Chemical derivative 293 0.0 +CAMthiopropanoyl@K 145.019749 145.1796 H(7)C(5)N(1)O(2)S(1) 0.0 Chemical derivative 293 0.0 +IED-Biotin@C 326.141261 326.4145 H(22)C(14)N(4)O(3)S(1) 0.0 Chemical derivative 294 0.0 +dHex@N 146.057909 146.1412 H(10)C(6)O(4) 146.057909 H(10)C(6)O(4) N-linked glycosylation 295 0.5 +dHex@T 146.057909 146.1412 H(10)C(6)O(4) 146.057909 H(10)C(6)O(4) O-linked glycosylation 295 0.5 +dHex@S 146.057909 146.1412 H(10)C(6)O(4) 146.057909 H(10)C(6)O(4) O-linked glycosylation 295 0.5 +Methyl:2H(3)@Anywhere 17.03448 17.0451 H(-1)2H(3)C(1) 0.0 Isotopic label 298 0.0 +Methyl:2H(3)@D 17.03448 17.0451 H(-1)2H(3)C(1) 0.0 Isotopic label 298 0.0 +Methyl:2H(3)@E 17.03448 17.0451 H(-1)2H(3)C(1) 0.0 Isotopic label 298 0.0 +Methyl:2H(3)@K 17.03448 17.0451 H(-1)2H(3)C(1) 0.0 Isotopic label 298 0.0 +Methyl:2H(3)@R 17.03448 17.0451 H(-1)2H(3)C(1) 0.0 Isotopic label 298 0.0 +Carboxy@E 43.989829 44.0095 C(1)O(2) 0.0 Post-translational 299 0.0 +Carboxy@D 43.989829 44.0095 C(1)O(2) 0.0 Post-translational 299 0.0 +Carboxy@K 43.989829 44.0095 C(1)O(2) 0.0 Post-translational 299 0.0 +Carboxy@W 43.989829 44.0095 C(1)O(2) 0.0 Chemical derivative 299 0.0 +Carboxy@M^Protein_N-term 43.989829 44.0095 C(1)O(2) 0.0 Post-translational 299 0.0 +Bromobimane@C 190.074228 190.1986 H(10)C(10)N(2)O(2) 0.0 Chemical derivative 301 0.0 +Menadione@K 170.036779 170.1641 H(6)C(11)O(2) 0.0 Chemical derivative 302 0.0 +Menadione@C 170.036779 170.1641 H(6)C(11)O(2) 0.0 Chemical derivative 302 0.0 +DeStreak@C 75.998285 76.1176 H(4)C(2)O(1)S(1) 0.0 Chemical derivative 303 0.0 +dHex(1)Hex(3)HexNAc(4)@T 1444.53387 1445.3331 H(92)C(56)N(4)O(39) 1444.53387 H(92)C(56)N(4)O(39) O-linked glycosylation 305 0.5 +dHex(1)Hex(3)HexNAc(4)@S 1444.53387 1445.3331 H(92)C(56)N(4)O(39) 1444.53387 H(92)C(56)N(4)O(39) O-linked glycosylation 305 0.5 +dHex(1)Hex(3)HexNAc(4)@N 1444.53387 1445.3331 H(92)C(56)N(4)O(39) 1444.53387 H(92)C(56)N(4)O(39) N-linked glycosylation 305 0.5 +dHex(1)Hex(4)HexNAc(4)@T 1606.586693 1607.4737 H(102)C(62)N(4)O(44) 1606.586693 H(102)C(62)N(4)O(44) O-linked glycosylation 307 0.5 +dHex(1)Hex(4)HexNAc(4)@S 1606.586693 1607.4737 H(102)C(62)N(4)O(44) 1606.586693 H(102)C(62)N(4)O(44) O-linked glycosylation 307 0.5 +dHex(1)Hex(4)HexNAc(4)@N 1606.586693 1607.4737 H(102)C(62)N(4)O(44) 1606.586693 H(102)C(62)N(4)O(44) N-linked glycosylation 307 0.5 +Pro+O(2)@H 129.042593 129.114 H(7)C(5)N(1)O(3) 0.0 Post-translational 2035 0.0 
+dHex(1)Hex(5)HexNAc(4)@N 1768.639517 1769.6143 H(112)C(68)N(4)O(49) 1768.639517 H(112)C(68)N(4)O(49) N-linked glycosylation 308 0.5 +Hex(3)HexNAc(4)@T 1298.475961 1299.1919 H(82)C(50)N(4)O(35) 1298.475961 H(82)C(50)N(4)O(35) O-linked glycosylation 309 0.5 +Hex(3)HexNAc(4)@S 1298.475961 1299.1919 H(82)C(50)N(4)O(35) 1298.475961 H(82)C(50)N(4)O(35) O-linked glycosylation 309 0.5 +Hex(3)HexNAc(4)@N 1298.475961 1299.1919 H(82)C(50)N(4)O(35) 1298.475961 H(82)C(50)N(4)O(35) N-linked glycosylation 309 0.5 +Hex(4)HexNAc(4)@T 1460.528784 1461.3325 H(92)C(56)N(4)O(40) 1460.528784 H(92)C(56)N(4)O(40) O-linked glycosylation 310 0.5 +Hex(4)HexNAc(4)@S 1460.528784 1461.3325 H(92)C(56)N(4)O(40) 1460.528784 H(92)C(56)N(4)O(40) O-linked glycosylation 310 0.5 +Hex(4)HexNAc(4)@N 1460.528784 1461.3325 H(92)C(56)N(4)O(40) 1460.528784 H(92)C(56)N(4)O(40) N-linked glycosylation 310 0.5 +Hex(5)HexNAc(4)@T 1622.581608 1623.4731 H(102)C(62)N(4)O(45) 1622.581608 H(102)C(62)N(4)O(45) O-linked glycosylation 311 0.5 +Hex(5)HexNAc(4)@S 1622.581608 1623.4731 H(102)C(62)N(4)O(45) 1622.581608 H(102)C(62)N(4)O(45) O-linked glycosylation 311 0.5 +Hex(5)HexNAc(4)@N 1622.581608 1623.4731 H(102)C(62)N(4)O(45) 1622.581608 H(102)C(62)N(4)O(45) N-linked glycosylation 311 0.5 +Cysteinyl@C 119.004099 119.1423 H(5)C(3)N(1)O(2)S(1) 0.0 Multiple 312 0.0 +Lys-loss@K -128.094963 -128.1723 H(-12)C(-6)N(-2)O(-1) 0.0 Artefact 313 0.0 +Lys-loss@K^Protein_C-term -128.094963 -128.1723 H(-12)C(-6)N(-2)O(-1) 0.0 Post-translational 313 0.0 +Nmethylmaleimide@K 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 314 0.0 +Nmethylmaleimide@C 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 314 0.0 +CyDye-Cy3@C 672.298156 672.8335 H(44)C(37)N(4)O(6)S(1) 0.0 Chemical derivative 494 0.0 +DimethylpyrroleAdduct@K 78.04695 78.1118 H(6)C(6) 0.0 Chemical derivative 316 0.0 +Delta:H(2)C(5)@K 62.01565 62.0694 H(2)C(5) 0.0 Chemical derivative 318 0.0 +Delta:H(2)C(3)O(1)@K 54.010565 54.0474 H(2)C(3)O(1) 0.0 Chemical derivative 319 0.0 +Delta:H(2)C(3)O(1)@R 54.010565 54.0474 H(2)C(3)O(1) 0.0 Chemical derivative 319 0.0 +Nethylmaleimide+water@K 143.058243 143.1406 H(9)C(6)N(1)O(3) 0.0 Chemical derivative 320 0.0 +Nethylmaleimide+water@C 143.058243 143.1406 H(9)C(6)N(1)O(3) 0.0 Chemical derivative 320 0.0 +Methyl+Acetyl:2H(3)@K 59.045045 59.0817 H(1)2H(3)C(3)O(1) 0.0 Isotopic label 768 0.0 +Xlink:B10621@C 713.093079 713.5626 H(30)C(31)N(4)O(6)S(1)I(1) 0.0 Chemical derivative 323 0.0 +Xlink:DTBP[87]@Protein_N-term 87.01427 87.1435 H(5)C(3)N(1)S(1) 0.0 Chemical derivative 324 0.0 +Xlink:DTBP[87]@K 87.01427 87.1435 H(5)C(3)N(1)S(1) 0.0 Chemical derivative 324 0.0 +FP-Biotin@K 572.316129 572.7405 H(49)C(27)N(4)O(5)P(1)S(1) 0.0 Chemical derivative 325 0.0 +FP-Biotin@T 572.316129 572.7405 H(49)C(27)N(4)O(5)P(1)S(1) 0.0 Chemical derivative 325 0.0 +FP-Biotin@Y 572.316129 572.7405 H(49)C(27)N(4)O(5)P(1)S(1) 0.0 Chemical derivative 325 0.0 +FP-Biotin@S 572.316129 572.7405 H(49)C(27)N(4)O(5)P(1)S(1) 0.0 Chemical derivative 325 0.0 +Thiophos-S-S-biotin@Y 525.142894 525.6658 H(34)C(19)N(4)O(5)P(1)S(3) 525.142894 H(34)C(19)N(4)O(5)P(1)S(3) Chemical derivative 332 0.5 +Thiophos-S-S-biotin@T 525.142894 525.6658 H(34)C(19)N(4)O(5)P(1)S(3) 525.142894 H(34)C(19)N(4)O(5)P(1)S(3) Chemical derivative 332 0.5 +Thiophos-S-S-biotin@S 525.142894 525.6658 H(34)C(19)N(4)O(5)P(1)S(3) 525.142894 H(34)C(19)N(4)O(5)P(1)S(3) Chemical derivative 332 0.5 +Can-FP-biotin@T 447.195679 447.5291 H(34)C(19)N(3)O(5)P(1)S(1) 0.0 Chemical derivative 333 0.0 +Can-FP-biotin@Y 447.195679 
447.5291 H(34)C(19)N(3)O(5)P(1)S(1) 0.0 Chemical derivative 333 0.0
+Can-FP-biotin@S 447.195679 447.5291 H(34)C(19)N(3)O(5)P(1)S(1) 0.0 Chemical derivative 333 0.0
+HNE+Delta:H(2)@K 158.13068 158.238 H(18)C(9)O(2) 0.0 Chemical derivative 335 0.0
+HNE+Delta:H(2)@H 158.13068 158.238 H(18)C(9)O(2) 0.0 Chemical derivative 335 0.0
+HNE+Delta:H(2)@C 158.13068 158.238 H(18)C(9)O(2) 0.0 Chemical derivative 335 0.0
+Thrbiotinhydrazide@T 240.104482 240.3252 H(16)C(10)N(4)O(1)S(1) 0.0 Chemical derivative 361 0.0
+Methylamine@T 13.031634 13.0418 H(3)C(1)N(1)O(-1) 0.0 Artefact 337 0.0
+Methylamine@S 13.031634 13.0418 H(3)C(1)N(1)O(-1) 0.0 Artefact 337 0.0
+Diisopropylphosphate@K 164.060231 164.1394 H(13)C(6)O(3)P(1) 0.0 Chemical derivative 362 0.0
+Diisopropylphosphate@Y 164.060231 164.1394 H(13)C(6)O(3)P(1) 0.0 Chemical derivative 362 0.0
+Diisopropylphosphate@T 164.060231 164.1394 H(13)C(6)O(3)P(1) 0.0 Chemical derivative 362 0.0
+Diisopropylphosphate@S 164.060231 164.1394 H(13)C(6)O(3)P(1) 0.0 Chemical derivative 362 0.0
+Diisopropylphosphate@Any_N-term 164.060231 164.1394 H(13)C(6)O(3)P(1) 0.0 Chemical derivative 362 0.0
+Isopropylphospho@Y 122.013281 122.0596 H(7)C(3)O(3)P(1) 0.0 Chemical derivative 363 0.0
+Isopropylphospho@T 122.013281 122.0596 H(7)C(3)O(3)P(1) 0.0 Chemical derivative 363 0.0
+Isopropylphospho@S 122.013281 122.0596 H(7)C(3)O(3)P(1) 0.0 Chemical derivative 363 0.0
+ICPL:13C(6)@Any_N-term 111.041593 111.05 H(3)13C(6)N(1)O(1) 0.0 Isotopic label 364 0.0
+ICPL:13C(6)@Protein_N-term 111.041593 111.05 H(3)13C(6)N(1)O(1) 0.0 Isotopic label 364 0.0
+ICPL:13C(6)@K 111.041593 111.05 H(3)13C(6)N(1)O(1) 0.0 Isotopic label 364 0.0
+CarbamidomethylDTT@C 209.018035 209.2864 H(11)C(6)N(1)O(3)S(2) 0.0 Artefact 893 0.0
+ICPL@Protein_N-term 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 365 0.0
+ICPL@K 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 365 0.0
+ICPL@Any_N-term 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 365 0.0
+Deamidated:18O(1)@Q 2.988261 2.9845 H(-1)N(-1)18O(1) 0.0 Isotopic label 366 0.0
+Deamidated:18O(1)@N 2.988261 2.9845 H(-1)N(-1)18O(1) 0.0 Isotopic label 366 0.0
+Arg->Orn@R -42.021798 -42.04 H(-2)C(-1)N(-2) 0.0 Artefact 372 0.0
+Cation:Cu[I]@Any_C-term 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 O[Cu] 0.0
+Cation:Cu[I]@E 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
+Cation:Cu[I]@D 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
+Cation:Cu[I]@H 61.921774 62.5381 H(-1)Cu(1) 0.0 Artefact 531 0.0
+Dehydro@C -1.007825 -1.0079 H(-1) 0.0 Multiple 374 0.0
+Diphthamide@H 142.110613 142.1989 H(14)C(7)N(2)O(1) 0.0 Post-translational 375 0.0
+Hydroxyfarnesyl@C 220.182715 220.3505 H(24)C(15)O(1) 0.0 Post-translational 376 0.0
+Diacylglycerol@C 576.511761 576.9334 H(68)C(37)O(4) 0.0 Post-translational 377 0.0
+Carboxyethyl@H 72.021129 72.0627 H(4)C(3)O(2) 0.0 Chemical derivative 378 0.0
+Carboxyethyl@K 72.021129 72.0627 H(4)C(3)O(2) 0.0 Post-translational 378 0.0
+Hypusine@K 87.068414 87.1204 H(9)C(4)N(1)O(1) 0.0 Post-translational 379 0.0
+Retinylidene@K 266.203451 266.4204 H(26)C(20) 0.0 Post-translational 380 0.0
+Lys->AminoadipicAcid@K 14.96328 14.9683 H(-3)N(-1)O(2) 0.0 Post-translational 381 0.0
+Cys->PyruvicAcid@C^Protein_N-term -33.003705 -33.0961 H(-3)N(-1)O(1)S(-1) 0.0 Post-translational 382 0.0
+Ammonia-loss@C^Any_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Artefact 385 0.0
+Ammonia-loss@S^Protein_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Post-translational 385 0.0
+Ammonia-loss@T^Protein_N-term -17.026549 -17.0305 H(-3)N(-1) 0.0 Post-translational 385 0.0
+Ammonia-loss@N -17.026549 -17.0305 H(-3)N(-1) 0.0 Chemical derivative 385 0.0
+Phycocyanobilin@C 586.279135 586.678 H(38)C(33)N(4)O(6) 0.0 Post-translational 387 0.0
+Phycoerythrobilin@C 588.294785 588.6939 H(40)C(33)N(4)O(6) 0.0 Post-translational 388 0.0
+Phytochromobilin@C 584.263485 584.6621 H(36)C(33)N(4)O(6) 0.0 Post-translational 389 0.0
+Heme@H 616.177295 616.4873 H(32)C(34)N(4)O(4)Fe(1) 0.0 Post-translational 390 0.0
+Heme@C 616.177295 616.4873 H(32)C(34)N(4)O(4)Fe(1) 0.0 Post-translational 390 0.0
+Molybdopterin@C 521.884073 520.2668 H(11)C(10)N(5)O(8)P(1)S(2)Mo(1) 0.0 Post-translational 391 0.0
+Quinone@W 29.974179 29.9829 H(-2)O(2) 0.0 Post-translational 392 0.0
+Quinone@Y 29.974179 29.9829 H(-2)O(2) 0.0 Post-translational 392 0.0
+Glucosylgalactosyl@K 340.100562 340.2806 H(20)C(12)O(11) 340.100562 H(20)C(12)O(11) Other glycosylation 393 0.5
+GPIanchor@Protein_C-term 123.00853 123.0477 H(6)C(2)N(1)O(3)P(1) 0.0 Post-translational 394 0.0
+PhosphoribosyldephosphoCoA@S 881.146904 881.6335 H(42)C(26)N(7)O(19)P(3)S(1) 0.0 Post-translational 395 0.0
+GlycerylPE@E 197.04531 197.1262 H(12)C(5)N(1)O(5)P(1) 0.0 Post-translational 396 0.0
+Triiodothyronine@Y 469.716159 469.785 H(1)C(6)O(1)I(3) 0.0 Post-translational 397 0.0
+Thyroxine@Y 595.612807 595.6815 C(6)O(1)I(4) 0.0 Post-translational 398 0.0
+Tyr->Dha@Y -94.041865 -94.1112 H(-6)C(-6)O(-1) 0.0 Post-translational 400 0.0
+Didehydro@S -2.01565 -2.0159 H(-2) 0.0 Post-translational 401 0.0
+Didehydro@Y -2.01565 -2.0159 H(-2) 0.0 Post-translational 401 0.0
+Didehydro@T -2.01565 -2.0159 H(-2) 0.0 Chemical derivative 401 0.0
+Didehydro@K^Any_C-term -2.01565 -2.0159 H(-2) 0.0 Artefact 401 0.0
+Cys->Oxoalanine@C -17.992806 -18.0815 H(-2)O(1)S(-1) 0.0 Post-translational 402 0.0
+Ser->LacticAcid@S^Protein_N-term -15.010899 -15.0146 H(-1)N(-1) 0.0 Post-translational 403 0.0
+GluGlu@E 258.085186 258.228 H(14)C(10)N(2)O(6) 0.0 Post-translational 451 0.0
+GluGlu@Protein_C-term 258.085186 258.228 H(14)C(10)N(2)O(6) 0.0 Post-translational 451 0.0
+Phosphoadenosine@S 329.05252 329.2059 H(12)C(10)N(5)O(6)P(1) 347.063085 H(14)C(10)N(5)O(7)P(1) Post-translational 405 0.5
+Phosphoadenosine@H 329.05252 329.2059 H(12)C(10)N(5)O(6)P(1) 0.0 Post-translational 405 0.0
+Phosphoadenosine@T 329.05252 329.2059 H(12)C(10)N(5)O(6)P(1) 347.063085 H(14)C(10)N(5)O(7)P(1) Post-translational 405 0.5
+Phosphoadenosine@Y 329.05252 329.2059 H(12)C(10)N(5)O(6)P(1) 135.054495 H(5)C(5)N(5) Post-translational 405 0.5
+Phosphoadenosine@K 329.05252 329.2059 H(12)C(10)N(5)O(6)P(1) 0.0 Post-translational 405 0.0
+Glu@E 129.042593 129.114 H(7)C(5)N(1)O(3) 0.0 Post-translational 450 0.0
+Glu@Protein_C-term 129.042593 129.114 H(7)C(5)N(1)O(3) 0.0 Chemical derivative 450 0.0
+Hydroxycinnamyl@C 146.036779 146.1427 H(6)C(9)O(2) 0.0 Post-translational 407 0.0
+Glycosyl@P 148.037173 148.114 H(8)C(5)O(5) 0.0 Other glycosylation 408 0.0
+FMNH@H 454.088965 454.3279 H(19)C(17)N(4)O(9)P(1) 0.0 Post-translational 409 0.0
+FMNH@C 454.088965 454.3279 H(19)C(17)N(4)O(9)P(1) 0.0 Post-translational 409 0.0
+Archaeol@C 634.662782 635.1417 H(86)C(43)O(2) 0.0 Post-translational 410 0.0
+Phenylisocyanate@Any_N-term 119.037114 119.1207 H(5)C(7)N(1)O(1) 0.0 Chemical derivative 411 0.0
+Phenylisocyanate:2H(5)@Any_N-term 124.068498 124.1515 2H(5)C(7)N(1)O(1) 0.0 Chemical derivative 412 0.0
+Phosphoguanosine@H 345.047435 345.2053 H(12)C(10)N(5)O(7)P(1) 0.0 Post-translational 413 0.0
+Phosphoguanosine@K 345.047435 345.2053 H(12)C(10)N(5)O(7)P(1) 0.0 Post-translational 413 0.0
+Hydroxymethyl@N 30.010565 30.026 H(2)C(1)O(1) 0.0 Post-translational 414 0.0
+MolybdopterinGD+Delta:S(-1)Se(1)@C 1620.930224 1618.9096 H(47)C(40)N(20)O(26)P(4)S(3)Se(1)Mo(1) 0.0 Post-translational 415 0.0
+Dipyrrolylmethanemethyl@C 418.137616 418.3973 H(22)C(20)N(2)O(8) 0.0 Post-translational 416 0.0
+PhosphoUridine@H 306.025302 306.166 H(11)C(9)N(2)O(8)P(1) 0.0 Post-translational 417 0.0
+PhosphoUridine@Y 306.025302 306.166 H(11)C(9)N(2)O(8)P(1) 0.0 Post-translational 417 0.0
+Glycerophospho@S 154.00311 154.0584 H(7)C(3)O(5)P(1) 0.0 Post-translational 419 0.0
+Carboxy->Thiocarboxy@G^Protein_C-term 15.977156 16.0656 O(-1)S(1) 0.0 Post-translational 420 0.0
+Sulfide@D 31.972071 32.065 S(1) 0.0 Post-translational 421 0.0
+Sulfide@C 31.972071 32.065 S(1) 0.0 Post-translational 421 0.0
+Sulfide@W 31.972071 32.065 S(1) 0.0 Chemical derivative 421 0.0
+PyruvicAcidIminyl@K 70.005479 70.0468 H(2)C(3)O(2) 0.0 Post-translational 422 0.0
+PyruvicAcidIminyl@V^Protein_N-term 70.005479 70.0468 H(2)C(3)O(2) 0.0 Post-translational 422 0.0
+PyruvicAcidIminyl@C^Protein_N-term 70.005479 70.0468 H(2)C(3)O(2) 0.0 Post-translational 422 0.0
+Delta:Se(1)@C 79.91652 78.96 Se(1) 0.0 Post-translational 423 0.0
+MolybdopterinGD@D 1572.985775 1572.0146 H(47)C(40)N(20)O(26)P(4)S(4)Mo(1) 0.0 Post-translational 424 0.0
+MolybdopterinGD@C 1572.985775 1572.0146 H(47)C(40)N(20)O(26)P(4)S(4)Mo(1) 0.0 Post-translational 424 0.0
+MolybdopterinGD@U 1572.985775 1572.0146 H(47)C(40)N(20)O(26)P(4)S(4)Mo(1) 0.0 Post-translational 424 0.0
+Dioxidation@U 31.989829 31.9988 O(2) 0.0 Multiple 425 0.0
+Dioxidation@C 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@W 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Dioxidation@Y 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@F 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Dioxidation@M 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@K 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@R 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@P 31.989829 31.9988 O(2) 0.0 Post-translational 425 0.0
+Dioxidation@E 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Dioxidation@I 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Dioxidation@L 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Dioxidation@V 31.989829 31.9988 O(2) 0.0 Chemical derivative 425 0.0
+Octanoyl@T 126.104465 126.1962 H(14)C(8)O(1) 0.0 Post-translational 426 0.0
+Octanoyl@S 126.104465 126.1962 H(14)C(8)O(1) 0.0 Post-translational 426 0.0
+Octanoyl@C 126.104465 126.1962 H(14)C(8)O(1) 0.0 Post-translational 426 0.0
+PhosphoHexNAc@T 283.045704 283.1724 H(14)C(8)N(1)O(8)P(1) 283.045704 H(14)C(8)N(1)O(8)P(1) O-linked glycosylation 428 0.5
+PhosphoHexNAc@S 283.045704 283.1724 H(14)C(8)N(1)O(8)P(1) 283.045704 H(14)C(8)N(1)O(8)P(1) O-linked glycosylation 428 0.5
+PhosphoHex@T 242.019154 242.1205 H(11)C(6)O(8)P(1) 242.019154 H(11)C(6)O(8)P(1) O-linked glycosylation 429 0.5
+PhosphoHex@S 242.019154 242.1205 H(11)C(6)O(8)P(1) 242.019154 H(11)C(6)O(8)P(1) O-linked glycosylation 429 0.5
+Palmitoleyl@C 236.214016 236.3929 H(28)C(16)O(1) 0.0 Post-translational 431 0.0
+Palmitoleyl@S 236.214016 236.3929 H(28)C(16)O(1) 0.0 Post-translational 431 0.0
+Palmitoleyl@T 236.214016 236.3929 H(28)C(16)O(1) 0.0 Pre-translational 431 0.0
+Cholesterol@Protein_C-term 368.344302 368.6383 H(44)C(27) 0.0 Post-translational 432 0.0
+Didehydroretinylidene@K 264.187801 264.4046 H(24)C(20) 0.0 Post-translational 433 0.0
+CHDH@D 294.183109 294.3859 H(26)C(17)O(4) 0.0 Post-translational 434 0.0
+Methylpyrroline@K 109.052764 109.1259 H(7)C(6)N(1)O(1) 0.0 Post-translational 435 0.0
+Hydroxyheme@E 614.161645 614.4714 H(30)C(34)N(4)O(4)Fe(1) 0.0 Post-translational 436 0.0
+MicrocinC7@Protein_C-term 386.110369 386.3003 H(19)C(13)N(6)O(6)P(1) 0.0 Post-translational 437 0.0
+Cyano@C 24.995249 25.0095 H(-1)C(1)N(1) 0.0 Post-translational 438 0.0
+Diironsubcluster@C 342.786916 342.876 H(-1)C(5)N(2)O(5)S(2)Fe(2) 0.0 Post-translational 439 0.0
+Amidino@C 42.021798 42.04 H(2)C(1)N(2) 0.0 Post-translational 440 0.0
+FMN@S 438.094051 438.3285 H(19)C(17)N(4)O(8)P(1) 0.0 Post-translational 442 0.0
+FMN@T 438.094051 438.3285 H(19)C(17)N(4)O(8)P(1) 0.0 Post-translational 442 0.0
+FMNC@C 456.104615 456.3438 H(21)C(17)N(4)O(9)P(1) 0.0 Post-translational 443 0.0
+CuSMo@C 922.834855 922.067 H(24)C(19)N(8)O(15)P(2)S(3)Mo(1)Cu(1) 0.0 Post-translational 444 0.0
+Hydroxytrimethyl@K 59.04969 59.0871 H(7)C(3)O(1) 0.0 Post-translational 445 0.0
+Deoxy@T -15.994915 -15.9994 O(-1) 0.0 Chemical derivative 447 0.0
+Deoxy@D -15.994915 -15.9994 O(-1) 0.0 Post-translational 447 0.0
+Deoxy@S -15.994915 -15.9994 O(-1) 0.0 Chemical derivative 447 0.0
+Microcin@Protein_C-term 831.197041 831.6871 H(37)C(36)N(3)O(20) 0.0 Post-translational 448 0.0
+Decanoyl@T 154.135765 154.2493 H(18)C(10)O(1) 0.0 Post-translational 449 0.0
+Decanoyl@S 154.135765 154.2493 H(18)C(10)O(1) 0.0 Post-translational 449 0.0
+GluGluGlu@Protein_C-term 387.127779 387.3419 H(21)C(15)N(3)O(9) 0.0 Post-translational 452 0.0
+GluGluGlu@E 387.127779 387.3419 H(21)C(15)N(3)O(9) 0.0 Post-translational 452 0.0
+GluGluGluGlu@Protein_C-term 516.170373 516.4559 H(28)C(20)N(4)O(12) 0.0 Post-translational 453 0.0
+GluGluGluGlu@E 516.170373 516.4559 H(28)C(20)N(4)O(12) 0.0 Post-translational 453 0.0
+HexN@W 161.068808 161.1558 H(11)C(6)N(1)O(4) 0.0 Other glycosylation 454 0.0
+HexN@T 161.068808 161.1558 H(11)C(6)N(1)O(4) 161.068808 H(11)C(6)N(1)O(4) O-linked glycosylation 454 0.5
+HexN@S 161.068808 161.1558 H(11)C(6)N(1)O(4) 161.068808 H(11)C(6)N(1)O(4) O-linked glycosylation 454 0.5
+HexN@N 161.068808 161.1558 H(11)C(6)N(1)O(4) 161.068808 H(11)C(6)N(1)O(4) N-linked glycosylation 454 0.5
+HexN@K 161.068808 161.1558 H(11)C(6)N(1)O(4) 0.0 Synth. pep. protect. gp. 454 0.0
+Xlink:DMP[154]@Protein_N-term 154.110613 154.2096 H(14)C(8)N(2)O(1) 0.0 Chemical derivative 455 0.0
+Xlink:DMP[154]@K 154.110613 154.2096 H(14)C(8)N(2)O(1) 0.0 Chemical derivative 455 0.0
+NDA@Any_N-term 175.042199 175.1855 H(5)C(13)N(1) 0.0 Chemical derivative 457 0.0
+NDA@K 175.042199 175.1855 H(5)C(13)N(1) 0.0 Chemical derivative 457 0.0
+SPITC:13C(6)@Any_N-term 220.991213 221.2054 H(5)C(1)13C(6)N(1)O(3)S(2) 0.0 Chemical derivative 464 0.0
+SPITC:13C(6)@K 220.991213 221.2054 H(5)C(1)13C(6)N(1)O(3)S(2) 0.0 Chemical derivative 464 0.0
+TMAB:2H(9)@Any_N-term 137.16403 137.2476 H(5)2H(9)C(7)N(1)O(1) 68.12999 2H(9)C(3)N(1) Isotopic label 477 0.5
+TMAB:2H(9)@K 137.16403 137.2476 H(5)2H(9)C(7)N(1)O(1) 68.12999 2H(9)C(3)N(1) Isotopic label 477 0.5
+TMAB@Any_N-term 128.107539 128.1922 H(14)C(7)N(1)O(1) 59.073499 H(9)C(3)N(1) Isotopic label 476 0.5
+TMAB@K 128.107539 128.1922 H(14)C(7)N(1)O(1) 59.073499 H(9)C(3)N(1) Isotopic label 476 0.5
+FTC@S 421.073241 421.4259 H(15)C(21)N(3)O(5)S(1) 0.0 Chemical derivative 478 0.0
+FTC@R 421.073241 421.4259 H(15)C(21)N(3)O(5)S(1) 0.0 Chemical derivative 478 0.0
+FTC@P 421.073241 421.4259 H(15)C(21)N(3)O(5)S(1) 0.0 Chemical derivative 478 0.0
+FTC@K 421.073241 421.4259 H(15)C(21)N(3)O(5)S(1) 0.0 Chemical derivative 478 0.0
+FTC@C 421.073241 421.4259 H(15)C(21)N(3)O(5)S(1) 0.0 Chemical derivative 478 0.0
+AEC-MAEC@T 59.019355 59.1334 H(5)C(2)N(1)O(-1)S(1) 0.0 Chemical derivative 472 0.0
+AEC-MAEC@S 59.019355 59.1334 H(5)C(2)N(1)O(-1)S(1) 0.0 Chemical derivative 472 0.0
+BADGE@C 340.167459 340.4129 H(24)C(21)O(4) 0.0 Non-standard residue 493 0.0
+Label:2H(4)@A 4.025107 4.0246 H(-4)2H(4) 0.0 Isotopic label 481 0.0
+Label:2H(4)@Y 4.025107 4.0246 H(-4)2H(4) 0.0 Isotopic label 481 0.0
+Label:2H(4)@F 4.025107 4.0246 H(-4)2H(4) 0.0 Isotopic label 481 0.0
+Label:2H(4)@K 4.025107 4.0246 H(-4)2H(4) 0.0 Isotopic label 481 0.0
+Label:2H(4)@U 4.025107 4.0246 H(-4)2H(4) 0.0 Isotopic label 481 0.0
+Hep@T 192.063388 192.1666 H(12)C(7)O(6) 192.063388 H(12)C(7)O(6) O-linked glycosylation 490 0.5
+Hep@S 192.063388 192.1666 H(12)C(7)O(6) 192.063388 H(12)C(7)O(6) O-linked glycosylation 490 0.5
+Hep@R 192.063388 192.1666 H(12)C(7)O(6) 0.0 N-linked glycosylation 490 0.0
+Hep@Q 192.063388 192.1666 H(12)C(7)O(6) 0.0 Other glycosylation 490 0.0
+Hep@N 192.063388 192.1666 H(12)C(7)O(6) 192.063388 H(12)C(7)O(6) N-linked glycosylation 490 0.5
+Hep@K 192.063388 192.1666 H(12)C(7)O(6) 0.0 Other glycosylation 490 0.0
+CyDye-Cy5@C 684.298156 684.8442 H(44)C(38)N(4)O(6)S(1) 0.0 Chemical derivative 495 0.0
+DHP@C 118.065674 118.1558 H(8)C(8)N(1) 0.0 Chemical derivative 488 0.0
+BHTOH@H 234.16198 234.334 H(22)C(15)O(2) 0.0 Other 498 0.0
+BHTOH@C 234.16198 234.334 H(22)C(15)O(2) 0.0 Other 498 0.0
+BHTOH@K 234.16198 234.334 H(22)C(15)O(2) 0.0 Other 498 0.0
+IGBP:13C(2)@C 298.022748 299.1331 H(13)C(10)13C(2)N(2)O(2)Br(1) 0.0 Isotopic label 499 0.0
+Nmethylmaleimide+water@C 129.042593 129.114 H(7)C(5)N(1)O(3) 0.0 Chemical derivative 500 0.0
+PyMIC@Any_N-term 134.048013 134.1353 H(6)C(7)N(2)O(1) 0.0 Chemical derivative 501 0.0
+LG-lactam-K@Protein_N-term 332.19876 332.4339 H(28)C(20)O(4) 0.0 Post-translational 503 0.0
+LG-lactam-K@K 332.19876 332.4339 H(28)C(20)O(4) 0.0 Post-translational 503 0.0
+BisANS@K 594.091928 594.6569 H(22)C(32)N(2)O(6)S(2) 0.0 Chemical derivative 519 0.0
+Piperidine@Any_N-term 68.0626 68.117 H(8)C(5) 0.0 Chemical derivative 520 0.0
+Piperidine@K 68.0626 68.117 H(8)C(5) 0.0 Chemical derivative 520 0.0
+Diethyl@Any_N-term 56.0626 56.1063 H(8)C(4) 0.0 Chemical derivative 518 0.0
+Diethyl@K 56.0626 56.1063 H(8)C(4) 0.0 Chemical derivative 518 0.0
+LG-Hlactam-K@Protein_N-term 348.193674 348.4333 H(28)C(20)O(5) 0.0 Post-translational 504 0.0
+LG-Hlactam-K@K 348.193674 348.4333 H(28)C(20)O(5) 0.0 Post-translational 504 0.0
+Dimethyl:2H(4)13C(2)@Protein_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 [13C]([2H])([2H])([1H]) 0.0
+Dimethyl:2H(4)13C(2)@R 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
+Dimethyl:2H(4)13C(2)@K 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 0.0
+Dimethyl:2H(4)13C(2)@Any_N-term 34.063117 34.0631 2H(4)13C(2) 0.0 Isotopic label 510 [13C]([2H])([2H])([1H]) 0.0
+C8-QAT@Any_N-term 227.224915 227.3862 H(29)C(14)N(1)O(1) 0.0 Chemical derivative 513 0.0
+C8-QAT@K 227.224915 227.3862 H(29)C(14)N(1)O(1) 0.0 Chemical derivative 513 0.0
+Hex(2)@R 324.105647 324.2812 H(20)C(12)O(10) 0.0 Other glycosylation 512 0.0
+Hex(2)@K 324.105647 324.2812 H(20)C(12)O(10) 0.0 Other glycosylation 512 0.0
+Hex(2)@S 324.105647 324.2812 H(20)C(12)O(10) 324.105647 H(20)C(12)O(10) O-linked glycosylation 512 0.5
+Hex(2)@T 324.105647 324.2812 H(20)C(12)O(10) 324.105647 H(20)C(12)O(10) O-linked glycosylation 512 0.5
+LG-lactam-R@R 290.176961 290.3939 H(26)C(19)N(-2)O(4) 0.0 Post-translational 505 0.0
+Withaferin@C 470.266839 470.5977 H(38)C(28)O(6) 0.0 Chemical derivative 1036 0.0
+Biotin:Thermo-88317@S 443.291294 443.5603 H(42)C(22)N(3)O(4)P(1) 0.0 Chemical derivative 1037 0.0
+Biotin:Thermo-88317@Y 443.291294 443.5603 H(42)C(22)N(3)O(4)P(1) 0.0 Chemical derivative 1037 0.0
+CLIP_TRAQ_2@Any_N-term 141.098318 141.1756 H(12)C(6)13C(1)N(2)O(1) 0.0 Isotopic label 525 0.0
+CLIP_TRAQ_2@K 141.098318 141.1756 H(12)C(6)13C(1)N(2)O(1) 0.0 Isotopic label 525 0.0
+CLIP_TRAQ_2@Y 141.098318 141.1756 H(12)C(6)13C(1)N(2)O(1) 0.0 Isotopic label 525 0.0
+LG-Hlactam-R@R 306.171876 306.3933 H(26)C(19)N(-2)O(5) 0.0 Post-translational 506 0.0
+Maleimide-PEO2-Biotin@C 525.225719 525.6183 H(35)C(23)N(5)O(7)S(1) 0.0 Chemical derivative 522 0.0
+Sulfo-NHS-LC-LC-Biotin@Any_N-term 452.245726 452.6106 H(36)C(22)N(4)O(4)S(1) 0.0 Chemical derivative 523 0.0
+Sulfo-NHS-LC-LC-Biotin@K 452.245726 452.6106 H(36)C(22)N(4)O(4)S(1) 0.0 Chemical derivative 523 0.0
+FNEM@C 427.069202 427.3625 H(13)C(24)N(1)O(7) 0.0 Chemical derivative 515 0.0
+PropylNAGthiazoline@C 232.064354 232.2768 H(14)C(9)N(1)O(4)S(1) 0.0 Chemical derivative 514 0.0
+Dethiomethyl@M -48.003371 -48.1075 H(-4)C(-1)S(-1) 0.0 Artefact 526 0.0
+iTRAQ4plex114@Y 144.105918 144.168 H(12)C(5)13C(2)N(2)18O(1) 0.0 Isotopic label 532 0.0
+iTRAQ4plex114@Any_N-term 144.105918 144.168 H(12)C(5)13C(2)N(2)18O(1) 0.0 Isotopic label 532 0.0
+iTRAQ4plex114@K 144.105918 144.168 H(12)C(5)13C(2)N(2)18O(1) 0.0 Isotopic label 532 0.0
+iTRAQ4plex114@C 144.105918 144.168 H(12)C(5)13C(2)N(2)18O(1) 0.0 Isotopic label 532 0.0
+iTRAQ4plex115@Y 144.099599 144.1688 H(12)C(6)13C(1)N(1)15N(1)18O(1) 0.0 Isotopic label 533 0.0
+iTRAQ4plex115@Any_N-term 144.099599 144.1688 H(12)C(6)13C(1)N(1)15N(1)18O(1) 0.0 Isotopic label 533 0.0
+iTRAQ4plex115@K 144.099599 144.1688 H(12)C(6)13C(1)N(1)15N(1)18O(1) 0.0 Isotopic label 533 0.0
+iTRAQ4plex115@C 144.099599 144.1688 H(12)C(6)13C(1)N(1)15N(1)18O(1) 0.0 Isotopic label 533 0.0
+Dibromo@Y 155.821022 157.7921 H(-2)Br(2) 0.0 Chemical derivative 534 0.0
+LRGG@K 383.228103 383.446 H(29)C(16)N(7)O(4) 0.0 Chemical derivative 535 0.0
+CLIP_TRAQ_3@Y 271.148736 271.2976 H(20)C(11)13C(1)N(3)O(4) 0.0 Isotopic label 536 0.0
+CLIP_TRAQ_3@Any_N-term 271.148736 271.2976 H(20)C(11)13C(1)N(3)O(4) 0.0 Isotopic label 536 0.0
+CLIP_TRAQ_3@K 271.148736 271.2976 H(20)C(11)13C(1)N(3)O(4) 0.0 Isotopic label 536 0.0
+CLIP_TRAQ_4@Any_N-term 244.101452 244.2292 H(15)C(9)13C(1)N(2)O(5) 0.0 Isotopic label 537 0.0
+CLIP_TRAQ_4@K 244.101452 244.2292 H(15)C(9)13C(1)N(2)O(5) 0.0 Isotopic label 537 0.0
+CLIP_TRAQ_4@Y 244.101452 244.2292 H(15)C(9)13C(1)N(2)O(5) 0.0 Isotopic label 537 0.0
+Biotin:Cayman-10141@C 626.386577 626.8927 H(54)C(35)N(4)O(4)S(1) 0.0 Other 538 0.0
+Biotin:Cayman-10013@C 660.428442 660.9504 H(60)C(36)N(4)O(5)S(1) 0.0 Other 539 0.0
+Ala->Ser@A 15.994915 15.9994 H(0)C(0)N(0)O(1)S(0) 0.0 AA substitution 540 0.0
+Ala->Thr@A 30.010565 30.026 H(2)C(1)N(0)O(1)S(0) 0.0 AA substitution 541 0.0
+Ala->Asp@A 43.989829 44.0095 H(0)C(1)N(0)O(2)S(0) 0.0 AA substitution 542 0.0
+Ala->Pro@A 26.01565 26.0373 H(2)C(2)N(0)O(0)S(0) 0.0 AA substitution 543 0.0
+Ala->Gly@A -14.01565 -14.0266 H(-2)C(-1)N(0)O(0)S(0) 0.0 AA substitution 544 0.0
+Ala->Glu@A 58.005479 58.0361 H(2)C(2)N(0)O(2)S(0) 0.0 AA substitution 545 0.0
+Ala->Val@A 28.0313 28.0532 H(4)C(2)N(0)O(0)S(0) 0.0 AA substitution 546 0.0
+Cys->Phe@C 44.059229 44.031 H(4)C(6)N(0)O(0)S(-1) 0.0 AA substitution 547 0.0
+Cys->Ser@C -15.977156 -16.0656 H(0)C(0)N(0)O(1)S(-1) 0.0 AA substitution 548 0.0
+Cys->Trp@C 83.070128 83.067 H(5)C(8)N(1)O(0)S(-1) 0.0 AA substitution 549 0.0
+Cys->Tyr@C 60.054144 60.0304 H(4)C(6)N(0)O(1)S(-1) 0.0 AA substitution 550 0.0
+Cys->Arg@C 53.091927 53.0428 H(7)C(3)N(3)O(0)S(-1) 0.0 AA substitution 551 0.0
+Cys->Gly@C -45.987721 -46.0916 H(-2)C(-1)N(0)O(0)S(-1) 0.0 AA substitution 552 0.0
+Asp->Ala@D -43.989829 -44.0095 H(0)C(-1)N(0)O(-2)S(0) 0.0 AA substitution 553 0.0
+Asp->His@D 22.031969 22.0519 H(2)C(2)N(2)O(-2)S(0) 0.0 AA substitution 554 0.0
+Asp->Asn@D -0.984016 -0.9848 H(1)C(0)N(1)O(-1)S(0) 0.0 AA substitution 555 0.0
+Asp->Gly@D -58.005479 -58.0361 H(-2)C(-2)N(0)O(-2)S(0) 0.0 AA substitution 556 0.0
+Asp->Tyr@D 48.036386 48.0859 H(4)C(5)N(0)O(-1)S(0) 0.0 AA substitution 557 0.0
+Asp->Glu@D 14.01565 14.0266 H(2)C(1)N(0)O(0)S(0) 0.0 AA substitution 558 0.0
+Asp->Val@D -15.958529 -15.9563 H(4)C(1)N(0)O(-2)S(0) 0.0 AA substitution 559 0.0
+Glu->Ala@E -58.005479 -58.0361 H(-2)C(-2)N(0)O(-2)S(0) 0.0 AA substitution 560 0.0
+Glu->Gln@E -0.984016 -0.9848 H(1)C(0)N(1)O(-1)S(0) 0.0 AA substitution 561 0.0
+Glu->Asp@E -14.01565 -14.0266 H(-2)C(-1)N(0)O(0)S(0) 0.0 AA substitution 562 0.0
+Glu->Lys@E -0.94763 -0.9417 H(5)C(1)N(1)O(-2)S(0) 0.0 AA substitution 563 0.0
+Glu->Gly@E -72.021129 -72.0627 H(-4)C(-3)N(0)O(-2)S(0) 0.0 AA substitution 564 0.0
+Glu->Val@E -29.974179 -29.9829 H(2)C(0)N(0)O(-2)S(0) 0.0 AA substitution 565 0.0
+Phe->Ser@F -60.036386 -60.0966 H(-4)C(-6)N(0)O(1)S(0) 0.0 AA substitution 566 0.0
+Phe->Cys@F -44.059229 -44.031 H(-4)C(-6)N(0)O(0)S(1) 0.0 AA substitution 567 0.0
+Phe->Xle@F -33.98435 -34.0162 H(2)C(-3) 0.0 AA substitution 568 0.0
+Phe->Tyr@F 15.994915 15.9994 H(0)C(0)N(0)O(1)S(0) 0.0 AA substitution 569 0.0
+Phe->Val@F -48.0 -48.0428 H(0)C(-4)N(0)O(0)S(0) 0.0 AA substitution 570 0.0
+Gly->Ala@G 14.01565 14.0266 H(2)C(1)N(0)O(0)S(0) 0.0 AA substitution 571 0.0
+Gly->Ser@G 30.010565 30.026 H(2)C(1)N(0)O(1)S(0) 0.0 AA substitution 572 0.0
+Gly->Trp@G 129.057849 129.1586 H(7)C(9)N(1)O(0)S(0) 0.0 AA substitution 573 0.0
+Gly->Glu@G 72.021129 72.0627 H(4)C(3)N(0)O(2)S(0) 0.0 AA substitution 574 0.0
+Gly->Val@G 42.04695 42.0797 H(6)C(3)N(0)O(0)S(0) 0.0 AA substitution 575 0.0
+Gly->Asp@G 58.005479 58.0361 H(2)C(2)N(0)O(2)S(0) 0.0 AA substitution 576 0.0
+Gly->Cys@G 45.987721 46.0916 H(2)C(1)N(0)O(0)S(1) 0.0 AA substitution 577 0.0
+Gly->Arg@G 99.079647 99.1344 H(9)C(4)N(3)O(0)S(0) 0.0 AA substitution 578 0.0
+dNIC@Any_N-term 109.048119 109.1205 H(1)2H(3)C(6)N(1)O(1) 0.0 Isotopic label 698 0.0
+dNIC@K 109.048119 109.1205 H(1)2H(3)C(6)N(1)O(1) 0.0 Isotopic label 698 0.0
+His->Pro@H -40.006148 -40.0241 H(0)C(-1)N(-2)O(0)S(0) 0.0 AA substitution 580 0.0
+His->Tyr@H 26.004417 26.034 H(2)C(3)N(-2)O(1)S(0) 0.0 AA substitution 581 0.0
+His->Gln@H -9.000334 -9.0101 H(1)C(-1)N(-1)O(1)S(0) 0.0 AA substitution 582 0.0
+NIC@Any_N-term 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 697 0.0
+NIC@K 105.021464 105.0941 H(3)C(6)N(1)O(1) 0.0 Isotopic label 697 0.0
+His->Arg@H 19.042199 19.0464 H(5)C(0)N(1)O(0)S(0) 0.0 AA substitution 584 0.0
+His->Xle@H -23.974848 -23.9816 H(4)N(-2) 0.0 AA substitution 585 0.0
+Xle->Ala@L -42.04695 -42.0797 H(-6)C(-3)N(0)O(0)S(0) 0.0 AA substitution 1125 0.0
+Xle->Ala@I -42.04695 -42.0797 H(-6)C(-3)N(0)O(0)S(0) 0.0 AA substitution 1125 0.0
+Xle->Thr@L -12.036386 -12.0538 H(-4)C(-2)O(1) 0.0 AA substitution 588 0.0
+Xle->Thr@I -12.036386 -12.0538 H(-4)C(-2)O(1) 0.0 AA substitution 588 0.0
+Xle->Asn@L 0.958863 0.945 H(-5)C(-2)N(1)O(1) 0.0 AA substitution 589 0.0
+Xle->Asn@I 0.958863 0.945 H(-5)C(-2)N(1)O(1) 0.0 AA substitution 589 0.0
+Xle->Lys@L 15.010899 15.0146 H(1)N(1) 0.0 AA substitution 590 0.0
+Xle->Lys@I 15.010899 15.0146 H(1)N(1) 0.0 AA substitution 590 0.0
+Lys->Thr@K -27.047285 -27.0684 H(-5)C(-2)N(-1)O(1)S(0) 0.0 AA substitution 594 0.0
+Lys->Asn@K -14.052036 -14.0696 H(-6)C(-2)N(0)O(1)S(0) 0.0 AA substitution 595 0.0
+Lys->Glu@K 0.94763 0.9417 H(-5)C(-1)N(-1)O(2)S(0) 0.0 AA substitution 596 0.0
+Lys->Gln@K -0.036386 -0.0431 H(-4)C(-1)N(0)O(1)S(0) 0.0 AA substitution 597 0.0
+Lys->Met@K 2.945522 3.0238 H(-3)C(-1)N(-1)O(0)S(1) 0.0 AA substitution 598 0.0
+Lys->Arg@K 28.006148 28.0134 H(0)C(0)N(2)O(0)S(0) 0.0 AA substitution 599 0.0
+Lys->Xle@K -15.010899 -15.0146 H(-1)N(-1) 0.0 AA substitution 600 0.0
+Xle->Ser@I -26.052036 -26.0803 H(-6)C(-3)O(1) 0.0 AA substitution 601 0.0
+Xle->Ser@L -26.052036 -26.0803 H(-6)C(-3)O(1) 0.0 AA substitution 601 0.0
+Xle->Phe@I 33.98435 34.0162 H(-2)C(3) 0.0 AA substitution 602 0.0
+Xle->Phe@L 33.98435 34.0162 H(-2)C(3) 0.0 AA substitution 602 0.0
+Xle->Trp@I 72.995249 73.0523 H(-1)C(5)N(1) 0.0 AA substitution 603 0.0
+Xle->Trp@L 72.995249 73.0523 H(-1)C(5)N(1) 0.0 AA substitution 603 0.0
+Xle->Pro@I -16.0313 -16.0425 H(-4)C(-1) 0.0 AA substitution 604 0.0
+Xle->Pro@L -16.0313 -16.0425 H(-4)C(-1) 0.0 AA substitution 604 0.0
+Xle->Val@I -14.01565 -14.0266 H(-2)C(-1) 0.0 AA substitution 605 0.0
+Xle->Val@L -14.01565 -14.0266 H(-2)C(-1) 0.0 AA substitution 605 0.0
+Xle->His@I 23.974848 23.9816 H(-4)N(2) 0.0 AA substitution 606 0.0
+Xle->His@L 23.974848 23.9816 H(-4)N(2) 0.0 AA substitution 606 0.0
+Xle->Gln@I 14.974514 14.9716 H(-3)C(-1)N(1)O(1) 0.0 AA substitution 607 0.0
+Xle->Gln@L 14.974514 14.9716 H(-3)C(-1)N(1)O(1) 0.0 AA substitution 607 0.0
+Xle->Met@I 17.956421 18.0384 H(-2)C(-1)S(1) 0.0 AA substitution 608 0.0
+Xle->Met@L 17.956421 18.0384 H(-2)C(-1)S(1) 0.0 AA substitution 608 0.0
+Xle->Arg@I 43.017047 43.028 H(1)N(3) 0.0 AA substitution 609 0.0
+Xle->Arg@L 43.017047 43.028 H(1)N(3) 0.0 AA substitution 609 0.0
+Met->Thr@M -29.992806 -30.0922 H(-2)C(-1)N(0)O(1)S(-1) 0.0 AA substitution 610 0.0
+Met->Arg@M 25.060626 24.9896 H(3)C(1)N(3)O(0)S(-1) 0.0 AA substitution 611 0.0
+Met->Lys@M -2.945522 -3.0238 H(3)C(1)N(1)O(0)S(-1) 0.0 AA substitution 613 0.0
+Met->Xle@M -17.956421 -18.0384 H(2)C(1)S(-1) 0.0 AA substitution 614 0.0
+Met->Val@M -31.972071 -32.065 H(0)C(0)N(0)O(0)S(-1) 0.0 AA substitution 615 0.0
+Asn->Ser@N -27.010899 -27.0253 H(-1)C(-1)N(-1)O(0)S(0) 0.0 AA substitution 616 0.0
+Asn->Thr@N -12.995249 -12.9988 H(1)C(0)N(-1)O(0)S(0) 0.0 AA substitution 617 0.0
+Asn->Lys@N 14.052036 14.0696 H(6)C(2)N(0)O(-1)S(0) 0.0 AA substitution 618 0.0
+Asn->Tyr@N 49.020401 49.0706 H(3)C(5)N(-1)O(0)S(0) 0.0 AA substitution 619 0.0
+Asn->His@N 23.015984 23.0366 H(1)C(2)N(1)O(-1)S(0) 0.0 AA substitution 620 0.0
+Asn->Asp@N 0.984016 0.9848 H(-1)C(0)N(-1)O(1)S(0) 0.0 AA substitution 621 0.0
+Asn->Xle@N -0.958863 -0.945 H(5)C(2)N(-1)O(-1) 0.0 AA substitution 622 0.0
+Pro->Ser@P -10.020735 -10.0379 H(-2)C(-2)N(0)O(1)S(0) 0.0 AA substitution 623 0.0
+Pro->Ala@P -26.01565 -26.0373 H(-2)C(-2)N(0)O(0)S(0) 0.0 AA substitution 624 0.0
+Pro->His@P 40.006148 40.0241 H(0)C(1)N(2)O(0)S(0) 0.0 AA substitution 625 0.0
+Pro->Gln@P 31.005814 31.014 H(1)C(0)N(1)O(1)S(0) 0.0 AA substitution 626 0.0
+Pro->Thr@P 3.994915 3.9887 H(0)C(-1)N(0)O(1)S(0) 0.0 AA substitution 627 0.0
+Pro->Arg@P 59.048347 59.0705 H(5)C(1)N(3)O(0)S(0) 0.0 AA substitution 628 0.0
+Pro->Xle@P 16.0313 16.0425 H(4)C(1) 0.0 AA substitution 629 0.0
+Gln->Pro@Q -31.005814 -31.014 H(-1)C(0)N(-1)O(-1)S(0) 0.0 AA substitution 630 0.0
+Gln->Lys@Q 0.036386 0.0431 H(4)C(1)N(0)O(-1)S(0) 0.0 AA substitution 631 0.0
+Gln->Glu@Q 0.984016 0.9848 H(-1)C(0)N(-1)O(1)S(0) 0.0 AA substitution 632 0.0
+Gln->His@Q 9.000334 9.0101 H(-1)C(1)N(1)O(-1)S(0) 0.0 AA substitution 633 0.0
+Gln->Arg@Q 28.042534 28.0565 H(4)C(1)N(2)O(-1)S(0) 0.0 AA substitution 634 0.0
+Gln->Xle@Q -14.974514 -14.9716 H(3)C(1)N(-1)O(-1) 0.0 AA substitution 635 0.0
+Arg->Ser@R -69.069083 -69.1084 H(-7)C(-3)N(-3)O(1)S(0) 0.0 AA substitution 636 0.0
+Arg->Trp@R 29.978202 30.0242 H(-2)C(5)N(-2)O(0)S(0) 0.0 AA substitution 637 0.0
+Arg->Thr@R -55.053433 -55.0818 H(-5)C(-2)N(-3)O(1)S(0) 0.0 AA substitution 638 0.0
+Arg->Pro@R -59.048347 -59.0705 H(-5)C(-1)N(-3)O(0)S(0) 0.0 AA substitution 639 0.0
+Arg->Lys@R -28.006148 -28.0134 H(0)C(0)N(-2)O(0)S(0) 0.0 AA substitution 640 0.0
+Arg->His@R -19.042199 -19.0464 H(-5)C(0)N(-1)O(0)S(0) 0.0 AA substitution 641 0.0
+Arg->Gln@R -28.042534 -28.0565 H(-4)C(-1)N(-2)O(1)S(0) 0.0 AA substitution 642 0.0
+Arg->Met@R -25.060626 -24.9896 H(-3)C(-1)N(-3)O(0)S(1) 0.0 AA substitution 643 0.0
+Arg->Cys@R -53.091927 -53.0428 H(-7)C(-3)N(-3)O(0)S(1) 0.0 AA substitution 644 0.0
+Arg->Xle@R -43.017047 -43.028 H(-1)N(-3) 0.0 AA substitution 645 0.0
+Arg->Gly@R -99.079647 -99.1344 H(-9)C(-4)N(-3)O(0)S(0) 0.0 AA substitution 646 0.0
+Ser->Phe@S 60.036386 60.0966 H(4)C(6)N(0)O(-1)S(0) 0.0 AA substitution 647 0.0
+Ser->Ala@S -15.994915 -15.9994 H(0)C(0)N(0)O(-1)S(0) 0.0 AA substitution 648 0.0
+Ser->Trp@S 99.047285 99.1326 H(5)C(8)N(1)O(-1)S(0) 0.0 AA substitution 649 0.0
+Ser->Thr@S 14.01565 14.0266 H(2)C(1)N(0)O(0)S(0) 0.0 AA substitution 650 0.0
+Ser->Asn@S 27.010899 27.0253 H(1)C(1)N(1)O(0)S(0) 0.0 AA substitution 651 0.0
+Ser->Pro@S 10.020735 10.0379 H(2)C(2)N(0)O(-1)S(0) 0.0 AA substitution 652 0.0
+Ser->Tyr@S 76.0313 76.096 H(4)C(6)N(0)O(0)S(0) 0.0 AA substitution 653 0.0
+Ser->Cys@S 15.977156 16.0656 H(0)C(0)N(0)O(-1)S(1) 0.0 AA substitution 654 0.0
+Ser->Arg@S 69.069083 69.1084 H(7)C(3)N(3)O(-1)S(0) 0.0 AA substitution 655 0.0
+Ser->Xle@S 26.052036 26.0803 H(6)C(3)O(-1) 0.0 AA substitution 656 0.0
+Ser->Gly@S -30.010565 -30.026 H(-2)C(-1)N(0)O(-1)S(0) 0.0 AA substitution 657 0.0
+Thr->Ser@T -14.01565 -14.0266 H(-2)C(-1)N(0)O(0)S(0) 0.0 AA substitution 658 0.0
+Thr->Ala@T -30.010565 -30.026 H(-2)C(-1)N(0)O(-1)S(0) 0.0 AA substitution 659 0.0
+Thr->Asn@T 12.995249 12.9988 H(-1)C(0)N(1)O(0)S(0) 0.0 AA substitution 660 0.0
+Thr->Lys@T 27.047285 27.0684 H(5)C(2)N(1)O(-1)S(0) 0.0 AA substitution 661 0.0
+Thr->Pro@T -3.994915 -3.9887 H(0)C(1)N(0)O(-1)S(0) 0.0 AA substitution 662 0.0
+Thr->Met@T 29.992806 30.0922 H(2)C(1)N(0)O(-1)S(1) 0.0 AA substitution 663 0.0
+Thr->Xle@T 12.036386 12.0538 H(4)C(2)O(-1) 0.0 AA substitution 664 0.0
+Thr->Arg@T 55.053433 55.0818 H(5)C(2)N(3)O(-1)S(0) 0.0 AA substitution 665 0.0
+Val->Phe@V 48.0 48.0428 H(0)C(4)N(0)O(0)S(0) 0.0 AA substitution 666 0.0
+Val->Ala@V -28.0313 -28.0532 H(-4)C(-2)N(0)O(0)S(0) 0.0 AA substitution 667 0.0
+Val->Glu@V 29.974179 29.9829 H(-2)C(0)N(0)O(2)S(0) 0.0 AA substitution 668 0.0
+Val->Met@V 31.972071 32.065 H(0)C(0)N(0)O(0)S(1) 0.0 AA substitution 669 0.0
+Val->Asp@V 15.958529 15.9563 H(-4)C(-1)N(0)O(2)S(0) 0.0 AA substitution 670 0.0
+Val->Xle@V 14.01565 14.0266 H(2)C(1) 0.0 AA substitution 671 0.0
+Val->Gly@V -42.04695 -42.0797 H(-6)C(-3)N(0)O(0)S(0) 0.0 AA substitution 672 0.0
+Trp->Ser@W -99.047285 -99.1326 H(-5)C(-8)N(-1)O(1)S(0) 0.0 AA substitution 673 0.0
+Trp->Cys@W -83.070128 -83.067 H(-5)C(-8)N(-1)O(0)S(1) 0.0 AA substitution 674 0.0
+Trp->Arg@W -29.978202 -30.0242 H(2)C(-5)N(2)O(0)S(0) 0.0 AA substitution 675 0.0
+Trp->Gly@W -129.057849 -129.1586 H(-7)C(-9)N(-1)O(0)S(0) 0.0 AA substitution 676 0.0
+Trp->Xle@W -72.995249 -73.0523 H(1)C(-5)N(-1) 0.0 AA substitution 677 0.0
+Tyr->Phe@Y -15.994915 -15.9994 H(0)C(0)N(0)O(-1)S(0) 0.0 AA substitution 678 0.0
+Tyr->Ser@Y -76.0313 -76.096 H(-4)C(-6)N(0)O(0)S(0) 0.0 AA substitution 679 0.0
+Tyr->Asn@Y -49.020401 -49.0706 H(-3)C(-5)N(1)O(0)S(0) 0.0 AA substitution 680 0.0
+Tyr->His@Y -26.004417 -26.034 H(-2)C(-3)N(2)O(-1)S(0) 0.0 AA substitution 681 0.0
+Tyr->Asp@Y -48.036386 -48.0859 H(-4)C(-5)N(0)O(1)S(0) 0.0 AA substitution 682 0.0
+Tyr->Cys@Y -60.054144 -60.0304 H(-4)C(-6)N(0)O(-1)S(1) 0.0 AA substitution 683 0.0
+BDMAPP@W 253.010225 254.1231 H(12)C(11)N(1)O(1)Br(1) 0.0 Artefact 684 0.0
+BDMAPP@Y 253.010225 254.1231 H(12)C(11)N(1)O(1)Br(1) 0.0 Artefact 684 0.0
+BDMAPP@Protein_N-term 253.010225 254.1231 H(12)C(11)N(1)O(1)Br(1) 0.0 Chemical derivative 684 0.0
+BDMAPP@K 253.010225 254.1231 H(12)C(11)N(1)O(1)Br(1) 0.0 Chemical derivative 684 0.0
+BDMAPP@H 253.010225 254.1231 H(12)C(11)N(1)O(1)Br(1) 0.0 Artefact 684 0.0
+NA-LNO2@C 325.225309 325.443 H(31)C(18)N(1)O(4) 0.0 Post-translational 685 0.0
+NA-LNO2@H 325.225309 325.443 H(31)C(18)N(1)O(4) 0.0 Post-translational 685 0.0
+NA-OA-NO2@C 327.240959 327.4589 H(33)C(18)N(1)O(4) 0.0 Post-translational 686 0.0
+NA-OA-NO2@H 327.240959 327.4589 H(33)C(18)N(1)O(4) 0.0 Post-translational 686 0.0
+ICPL:2H(4)@Any_N-term 109.046571 109.1188 H(-1)2H(4)C(6)N(1)O(1) 0.0 Isotopic label 687 0.0
+ICPL:2H(4)@Protein_N-term 109.046571 109.1188 H(-1)2H(4)C(6)N(1)O(1) 0.0 Isotopic label 687 0.0
+ICPL:2H(4)@K 109.046571 109.1188 H(-1)2H(4)C(6)N(1)O(1) 0.0 Isotopic label 687 0.0
+CarboxymethylDTT@C 210.00205 210.2712 H(10)C(6)O(4)S(2) 0.0 Artefact 894 0.0
+iTRAQ8plex@Protein_N-term 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@T 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@S 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@H 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@Y 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@Any_N-term 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@K 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+iTRAQ8plex@C 304.20536 304.3074 H(24)C(7)13C(7)N(3)15N(1)O(3) 0.0 Isotopic label 730 0.0
+Label:13C(6)15N(1)@I 7.017164 6.9493 C(-6)13C(6)N(-1)15N(1) 0.0 Isotopic label 695 0.0
+Label:13C(6)15N(1)@L 7.017164 6.9493 C(-6)13C(6)N(-1)15N(1) 0.0 Isotopic label 695 0.0
+Label:2H(9)13C(6)15N(2)@K 17.07069 16.9982 H(-9)2H(9)C(-6)13C(6)N(-2)15N(2) 0.0 Isotopic label 696 0.0
+HNE-Delta:H(2)O@K 138.104465 138.2069 H(14)C(9)O(1) 0.0 Chemical derivative 720 0.0
+HNE-Delta:H(2)O@H 138.104465 138.2069 H(14)C(9)O(1) 0.0 Chemical derivative 720 0.0
+HNE-Delta:H(2)O@C 138.104465 138.2069 H(14)C(9)O(1) 0.0 Chemical derivative 720 0.0
+4-ONE@K 154.09938 154.2063 H(14)C(9)O(2) 0.0 Chemical derivative 721 0.0
+4-ONE@H 154.09938 154.2063 H(14)C(9)O(2) 0.0 Chemical derivative 721 0.0
+4-ONE@C 154.09938 154.2063 H(14)C(9)O(2) 0.0 Chemical derivative 721 0.0
+O-Dimethylphosphate@Y 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 723 0.0
+O-Dimethylphosphate@T 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 723 0.0
+O-Dimethylphosphate@S 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 723 0.0
+O-Methylphosphate@Y 93.981981 94.0065 H(3)C(1)O(3)P(1) 0.0 Chemical derivative 724 0.0
+O-Methylphosphate@T 93.981981 94.0065 H(3)C(1)O(3)P(1) 0.0 Chemical derivative 724 0.0
+O-Methylphosphate@S 93.981981 94.0065 H(3)C(1)O(3)P(1) 0.0 Chemical derivative 724 0.0
+Diethylphosphate@Any_N-term 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@H 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@C 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@K 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@Y 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@T 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Diethylphosphate@S 136.028931 136.0862 H(9)C(4)O(3)P(1) 0.0 Chemical derivative 725 0.0
+Ethylphosphate@Any_N-term 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 726 0.0
+Ethylphosphate@K 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 726 0.0
+Ethylphosphate@Y 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 726 0.0
+Ethylphosphate@T 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 726 0.0
+Ethylphosphate@S 107.997631 108.0331 H(5)C(2)O(3)P(1) 0.0 Chemical derivative 726 0.0
+O-pinacolylmethylphosphonate@T 162.080967 162.1666 H(15)C(7)O(2)P(1) 0.0 Chemical derivative 727 0.0
+O-pinacolylmethylphosphonate@S 162.080967 162.1666 H(15)C(7)O(2)P(1) 0.0 Chemical derivative 727 0.0
+O-pinacolylmethylphosphonate@K 162.080967 162.1666 H(15)C(7)O(2)P(1) 0.0 Chemical derivative 727 0.0
+O-pinacolylmethylphosphonate@Y 162.080967 162.1666 H(15)C(7)O(2)P(1) 0.0 Chemical derivative 727 0.0
+O-pinacolylmethylphosphonate@H 162.080967 162.1666 H(15)C(7)O(2)P(1) 0.0 Chemical derivative 727 0.0
+Methylphosphonate@Y 77.987066 78.0071 H(3)C(1)O(2)P(1) 0.0 Chemical derivative 728 0.0
+Methylphosphonate@T 77.987066 78.0071 H(3)C(1)O(2)P(1) 0.0 Chemical derivative 728 0.0
+Methylphosphonate@S 77.987066 78.0071 H(3)C(1)O(2)P(1) 0.0 Chemical derivative 728 0.0
+O-Isopropylmethylphosphonate@Y 120.034017 120.0868 H(9)C(4)O(2)P(1) 0.0 Chemical derivative 729 0.0
+O-Isopropylmethylphosphonate@T 120.034017 120.0868 H(9)C(4)O(2)P(1) 0.0 Chemical derivative 729 0.0
+O-Isopropylmethylphosphonate@S 120.034017 120.0868 H(9)C(4)O(2)P(1) 0.0 Chemical derivative 729 0.0
+iTRAQ8plex:13C(6)15N(2)@Y 304.19904 304.3081 H(24)C(8)13C(6)N(2)15N(2)O(3) 0.0 Isotopic label 731 0.0
+iTRAQ8plex:13C(6)15N(2)@Any_N-term 304.19904 304.3081 H(24)C(8)13C(6)N(2)15N(2)O(3) 0.0 Isotopic label 731 0.0
+iTRAQ8plex:13C(6)15N(2)@K 304.19904 304.3081 H(24)C(8)13C(6)N(2)15N(2)O(3) 0.0 Isotopic label 731 0.0
+iTRAQ8plex:13C(6)15N(2)@C 304.19904 304.3081 H(24)C(8)13C(6)N(2)15N(2)O(3) 0.0 Isotopic label 731 0.0
+BEMAD_ST@T 136.001656 136.2357 H(8)C(4)O(1)S(2) 0.0 Chemical derivative 735 0.0
+BEMAD_ST@S 136.001656 136.2357 H(8)C(4)O(1)S(2) 0.0 Chemical derivative 735 0.0
+Ethanolamine@D 43.042199 43.0678 H(5)C(2)N(1) 0.0 Chemical derivative 734 0.0
+Ethanolamine@Any_C-term 43.042199 43.0678 H(5)C(2)N(1) 0.0 Chemical derivative 734 0.0
+Ethanolamine@E 43.042199 43.0678 H(5)C(2)N(1) 0.0 Chemical derivative 734 0.0
+Ethanolamine@C 43.042199 43.0678 H(5)C(2)N(1) 0.0 Chemical derivative 734 0.0
+TMT6plex@T 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+TMT6plex@S 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+TMT6plex@H 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+TMT6plex@Protein_N-term 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+TMT6plex@Any_N-term 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+TMT6plex@K 229.162932 229.2634 H(20)C(8)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 737 0.0
+BEMAD_C@C 120.0245 120.1701 H(8)C(4)O(2)S(1) 0.0 Chemical derivative 736 0.0
+TMT2plex@H 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT2plex@S 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT2plex@T 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT2plex@Protein_N-term 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT2plex@Any_N-term 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT2plex@K 225.155833 225.2921 H(20)C(11)13C(1)N(2)O(2) 0.0 Isotopic label 738 0.0
+TMT@Protein_N-term 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Chemical derivative 739 0.0
+TMT@Any_N-term 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Chemical derivative 739 0.0
+TMT@K 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Chemical derivative 739 0.0
+TMT@H 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Isotopic label 739 0.0
+TMT@S 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Isotopic label 739 0.0
+TMT@T 224.152478 224.2994 H(20)C(12)N(2)O(2) 0.0 Isotopic label 739 0.0
+ExacTagThiol@C 972.365219 972.7268 H(50)C(23)13C(12)N(8)15N(6)O(18) 0.0 Isotopic label 740 0.0
+ExacTagAmine@K 1046.347854 1046.8285 H(52)C(25)13C(12)N(8)15N(6)O(19)S(1) 0.0 Isotopic label 741 0.0
+NO_SMX_SEMD@C 251.036462 251.2618 H(9)C(10)N(3)O(3)S(1) 0.0 Chemical derivative 744 0.0
+4-ONE+Delta:H(-2)O(-1)@K 136.088815 136.191 H(12)C(9)O(1) 0.0 Chemical derivative 743 0.0
+4-ONE+Delta:H(-2)O(-1)@H 136.088815 136.191 H(12)C(9)O(1) 0.0 Chemical derivative 743 0.0
+4-ONE+Delta:H(-2)O(-1)@C 136.088815 136.191 H(12)C(9)O(1) 0.0 Chemical derivative 743 0.0
+Biotin:Aha-DADPS@M 922.465403 923.2022 H(70)C(42)N(8)O(11)S(1)Si(1) 0.0 Chemical derivative 2052 0.0
+NO_SMX_SIMD@C 267.031377 267.2612 H(9)C(10)N(3)O(4)S(1) 0.0 Chemical derivative 746 0.0
+Malonyl@C 86.000394 86.0462 H(2)C(3)O(3) 0.0 Chemical derivative 747 0.0
+Malonyl@S 86.000394 86.0462 H(2)C(3)O(3) 0.0 Chemical derivative 747 0.0
+Malonyl@K 86.000394 86.0462 H(2)C(3)O(3) 0.0 Post-translational 747 N([Xe])([Xe])[C@@H](CCCC(NC(=O)CC(=O)O))C(=O)[Rn] 0.0
+3sulfo@Any_N-term 183.983029 184.1693 H(4)C(7)O(4)S(1) 0.0 Chemical derivative 748 0.0
+trifluoro@L 53.971735 53.9714 H(-3)F(3) 0.0 Non-standard residue 750 0.0
+TNBS@Any_N-term 210.986535 211.0886 H(1)C(6)N(3)O(6) 0.0 Chemical derivative 751 0.0
+TNBS@K 210.986535 211.0886 H(1)C(6)N(3)O(6) 0.0 Chemical derivative 751 0.0
+Biotin-phenacyl@C 626.263502 626.727 H(38)C(29)N(8)O(6)S(1) 0.0 Chemical derivative 774 0.0
+Biotin-phenacyl@H 626.263502 626.727 H(38)C(29)N(8)O(6)S(1) 0.0 Chemical derivative 774 0.0
+Biotin-phenacyl@S 626.263502 626.727 H(38)C(29)N(8)O(6)S(1) 0.0 Chemical derivative 774 0.0
+BEMAD_C:2H(6)@C 126.062161 126.2071 H(2)2H(6)C(4)O(2)S(1) 0.0 Isotopic label 764 0.0
+lapachenole@C 240.11503 240.297 H(16)C(16)O(2) 0.0 Chemical derivative 771 0.0
+Label:13C(5)@P 5.016774 4.9633 C(-5)13C(5) 0.0 Isotopic label 772 0.0
+maleimide@K 97.016378 97.0721 H(3)C(4)N(1)O(2) 0.0 Chemical derivative 773 0.0
+maleimide@C 97.016378 97.0721 H(3)C(4)N(1)O(2) 0.0 Chemical derivative 773 0.0
+IDEnT@C 214.990469 216.064 H(7)C(9)N(1)O(1)Cl(2) 0.0 Isotopic label 762 0.0
+BEMAD_ST:2H(6)@T 142.039317 142.2727 H(2)2H(6)C(4)O(1)S(2) 0.0 Isotopic label 763 0.0
+BEMAD_ST:2H(6)@S 142.039317 142.2727 H(2)2H(6)C(4)O(1)S(2) 0.0 Isotopic label 763 0.0
+Met-loss@M^Protein_N-term -131.040485 -131.1961 H(-9)C(-5)N(-1)O(-1)S(-1) 0.0 Co-translational 765 0.0
+Met-loss+Acetyl@M^Protein_N-term -89.02992 -89.1594 H(-7)C(-3)N(-1)S(-1) 0.0 Co-translational 766 0.0
+Menadione-HQ@K 172.05243 172.18 H(8)C(11)O(2) 0.0 Chemical derivative 767 0.0
+Menadione-HQ@C 172.05243 172.18 H(8)C(11)O(2) 0.0 Chemical derivative 767 0.0
+Carboxymethyl:13C(2)@C 60.012189 60.0214 H(2)13C(2)O(2) 0.0 Chemical derivative 775 0.0
+NEM:2H(5)@C 130.079062 130.1561 H(2)2H(5)C(6)N(1)O(2) 0.0 Chemical derivative 776 0.0
+Gly-loss+Amide@G^Any_C-term -58.005479 -58.0361 H(-2)C(-2)O(-2) 0.0 Post-translational 822 0.0
+TMPP-Ac@Any_N-term 572.181134 572.5401 H(33)C(29)O(10)P(1) 0.0 Chemical derivative 827 0.0
+TMPP-Ac@K 572.181134 572.5401 H(33)C(29)O(10)P(1) 0.0 Artefact 827 0.0
+TMPP-Ac@Y 572.181134 572.5401 H(33)C(29)O(10)P(1) 0.0 Artefact 827 0.0
+Label:13C(6)+GG@K 120.063056 120.0586 H(6)C(-2)13C(6)N(2)O(2) 0.0 Isotopic label 799 0.0
+Arg->Npo@R 80.985078 81.0297 H(-1)C(3)N(1)O(2) 0.0 Chemical derivative 837 0.0
+Label:2H(4)+Acetyl@K 46.035672 46.0613 H(-2)2H(4)C(2)O(1) 0.0 Isotopic label 834 0.0
+Pentylamine@Q 70.07825 70.1329 H(10)C(5) 0.0 Chemical derivative 801 0.0
+Biotin:Thermo-21345@Q 311.166748 311.4429 H(25)C(15)N(3)O(2)S(1) 0.0 Chemical derivative 800 0.0
+Dihydroxyimidazolidine@R 72.021129 72.0627 H(4)C(3)O(2) 0.0 Multiple 830 0.0
+Xlink:DFDNB@N 163.985807 164.0752 C(6)N(2)O(4) 0.0 Chemical derivative 825 0.0
+Xlink:DFDNB@Q 163.985807 164.0752 C(6)N(2)O(4) 0.0 Chemical derivative 825 0.0
+Xlink:DFDNB@R 163.985807 164.0752 C(6)N(2)O(4) 0.0 Chemical derivative 825 0.0
+Xlink:DFDNB@K 163.985807 164.0752 C(6)N(2)O(4) 0.0 Chemical derivative 825 0.0
+Cy3b-maleimide@C 682.24612 682.7852 H(38)C(37)N(4)O(7)S(1) 0.0 Chemical derivative 821 0.0
+Hex(1)HexNAc(1)@N 365.132196 365.3331 H(23)C(14)N(1)O(10) 365.132196 H(23)C(14)N(1)O(10) N-linked glycosylation 793 0.5
+Hex(1)HexNAc(1)@T 365.132196 365.3331 H(23)C(14)N(1)O(10) 365.132196 H(23)C(14)N(1)O(10) O-linked glycosylation 793 0.5
+Hex(1)HexNAc(1)@S 365.132196 365.3331 H(23)C(14)N(1)O(10) 365.132196 H(23)C(14)N(1)O(10) O-linked glycosylation 793 0.5
+AEC-MAEC:2H(4)@S 63.044462 63.158 H(1)2H(4)C(2)N(1)O(-1)S(1) 0.0 Isotopic label 792 0.0
+AEC-MAEC:2H(4)@T 63.044462 63.158 H(1)2H(4)C(2)N(1)O(-1)S(1) 0.0 Isotopic label 792 0.0
+Xlink:BMOE@C 220.048407 220.1815 H(8)C(10)N(2)O(4) 0.0 Chemical derivative 824 0.0
+Biotin:Thermo-21360@Anywhere 487.246455 487.6134 H(37)C(21)N(5)O(6)S(1) 0.0 Chemical derivative 811 0.0
+Label:13C(6)+Acetyl@K 48.030694 47.9926 H(2)C(-4)13C(6)O(1) 0.0 Isotopic label 835 0.0
+Label:13C(6)15N(2)+Acetyl@K 50.024764 49.9794 H(2)C(-4)13C(6)N(-2)15N(2)O(1) 0.0 Isotopic label 836 0.0
+EQIGG@K 484.228162 484.5035 H(32)C(20)N(6)O(8) 0.0 Other 846 0.0
+cGMP@S 343.031785 343.1895 H(10)C(10)N(5)O(7)P(1) 0.0 Post-translational 849 0.0
+cGMP@C 343.031785 343.1895 H(10)C(10)N(5)O(7)P(1) 0.0 Post-translational 849 0.0
+cGMP+RMP-loss@C 150.041585 150.1182 H(4)C(5)N(5)O(1) 0.0 Post-translational 851 0.0
+cGMP+RMP-loss@S 150.041585 150.1182 H(4)C(5)N(5)O(1) 0.0 Post-translational 851 0.0
+mTRAQ@Y 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
+mTRAQ@Any_N-term 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 C(=O)CN1CCN(CC1)C 0.0
+mTRAQ@K 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)CN1CCN(C)CC1 0.0
+mTRAQ@H 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
+mTRAQ@S 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
+mTRAQ@T 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Isotopic label 888 0.0
+Arg2PG@R 266.057909 266.2482 H(10)C(16)O(4) 0.0 Chemical derivative 848 0.0
+Label:2H(4)+GG@K 118.068034 118.1273 H(2)2H(4)C(4)N(2)O(2) 0.0 Post-translational 853 0.0
+spermine@Q 185.189198 185.3097 H(23)C(10)N(3) 0.0 Chemical derivative 1420 0.0
+Label:13C(1)2H(3)@M 4.022185 4.0111 H(-3)2H(3)C(-1)13C(1) 0.0 Isotopic label 862 0.0
+ZGB@K 758.380841 758.7261 H(53)C(37)N(6)O(6)F(2)S(1)B(1) 0.0 Other 861 0.0
+ZGB@Any_N-term 758.380841 758.7261 H(53)C(37)N(6)O(6)F(2)S(1)B(1) 0.0 Other 861 0.0
+MG-H1@R 54.010565 54.0474 H(2)C(3)O(1) 0.0 Other 859 0.0
+G-H1@R 39.994915 40.0208 C(2)O(1) 0.0 Other 860 0.0
+Label:13C(6)15N(2)+GG@K 122.057126 122.0454 H(6)C(-2)13C(6)15N(2)O(2) 0.0 Isotopic label 864 0.0
+ICPL:13C(6)2H(4)@Any_N-term 115.0667 115.0747 H(-1)2H(4)13C(6)N(1)O(1) 0.0 Isotopic label 866 0.0
+ICPL:13C(6)2H(4)@K 115.0667 115.0747 H(-1)2H(4)13C(6)N(1)O(1) 0.0 Isotopic label 866 0.0
+ICPL:13C(6)2H(4)@Protein_N-term 115.0667 115.0747 H(-1)2H(4)13C(6)N(1)O(1) 0.0 Isotopic label 866 0.0
+DyLight-maleimide@C 940.1999 941.0762 H(48)C(39)N(4)O(15)S(4) 0.0 Chemical derivative 890 0.0
+mTRAQ:13C(3)15N(1)@S 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
+mTRAQ:13C(3)15N(1)@T 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
+mTRAQ:13C(3)15N(1)@H 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
+mTRAQ:13C(3)15N(1)@Y 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 0.0
+mTRAQ:13C(3)15N(1)@Any_N-term 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C 0.0
+mTRAQ:13C(3)15N(1)@K 144.102063 144.1544 H(12)C(4)13C(3)N(1)15N(1)O(1) 0.0 Isotopic label 889 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1CCN(C)[13CH2][13CH2]1 0.0
+Methyl-PEO12-Maleimide@C 710.383719 710.8073 H(58)C(32)N(2)O(15) 0.0 Chemical derivative 891 0.0
+MDCC@C 383.148121 383.3978 H(21)C(20)N(3)O(5) 0.0 Chemical derivative 887 0.0
+QQQTGG@K 599.266339 599.5942 H(37)C(23)N(9)O(10) 0.0 Other 877 0.0
+QEQTGG@K 600.250354 600.5789 H(36)C(23)N(8)O(11) 0.0 Other 876 0.0
+HydroxymethylOP@K 108.021129 108.0948 H(4)C(6)O(2) 0.0 Other 886 0.0
+Biotin:Thermo-21325@K 695.310118 695.8288 H(45)C(34)N(7)O(7)S(1) 0.0 Chemical derivative 884 0.0
+Label:13C(1)2H(3)+Oxidation@M 20.0171 20.0105 H(-3)2H(3)C(-1)13C(1)O(1) 0.0 Multiple 885 0.0
+shTMTpro@K 313.231019 313.2473 H(25)13C(15)15N(3)O(3) 0.0 Chemical derivative 2050 0.0
+shTMTpro@Protein_N-term 313.231019 313.2473 H(25)13C(15)15N(3)O(3) 0.0 Chemical derivative 2050 0.0
+shTMTpro@Any_N-term 313.231019 313.2473 H(25)13C(15)15N(3)O(3) 0.0 Chemical derivative 2050 0.0
+Biotin-PEG-PRA@M 578.317646 578.6611 H(42)C(26)N(8)O(7) 0.0 Chemical derivative 895 0.0
+Met->Aha@M -4.986324 -5.0794 H(-3)C(-1)N(3)S(-1) 0.0 Non-standard residue 896 0.0
+Label:15N(4)@R 3.98814 3.9736 N(-4)15N(4) 0.0 Isotopic label 897 0.0
+pyrophospho@T 159.932662 159.9598 H(2)O(6)P(2) 176.935402 H(3)O(7)P(2) Post-translational 898 0.5
+pyrophospho@S 159.932662 159.9598 H(2)O(6)P(2) 176.935402 H(3)O(7)P(2) Post-translational 898 0.5
+Met->Hpg@M -21.987721 -22.0702 H(-2)C(1)S(-1) 0.0 Non-standard residue 899 0.0
+4AcAllylGal@C 372.142033 372.3671 H(24)C(17)O(9) 0.0 Chemical derivative 901 0.0
+DimethylArsino@C 103.960719 103.9827 H(5)C(2)As(1) 0.0 Post-translational 902 0.0
+Lys->CamCys@K 31.935685 32.0219 H(-4)C(-1)O(1)S(1) 0.0 Pre-translational 903 0.0
+Phe->CamCys@F 12.962234 13.0204 H(-1)C(-4)N(1)O(1)S(1) 0.0 Pre-translational 904 0.0
+Leu->MetOx@L 33.951335 34.0378 H(-2)C(-1)O(1)S(1) 0.0 Pre-translational 905 0.0
+Lys->MetOx@K 18.940436 19.0232 H(-3)C(-1)N(-1)O(1)S(1) 0.0 Pre-translational 906 0.0
+Galactosyl@Any_N-term 178.047738 178.14 H(10)C(6)O(6) 0.0 Other glycosylation 907 0.0
+Galactosyl@K 178.047738 178.14 H(10)C(6)O(6) 0.0 Other glycosylation 907 0.0
+Xlink:SMCC[321]@C 321.205242 321.4146 H(27)C(17)N(3)O(3) 0.0 Chemical derivative 908 0.0
+Bacillosamine@N 228.111007 228.245 H(16)C(10)N(2)O(4) 228.111007 H(16)C(10)N(2)O(4) N-linked glycosylation 910 0.5
+MTSL@C 184.07961 184.2786 H(14)C(9)N(1)O(1)S(1) 0.0 Chemical derivative 911 0.0
+HNE-BAHAH@H 511.319226 511.7209 H(45)C(25)N(5)O(4)S(1) 0.0 Chemical derivative 912 0.0
+HNE-BAHAH@C 511.319226 511.7209 H(45)C(25)N(5)O(4)S(1) 0.0 Chemical derivative 912 0.0
+HNE-BAHAH@K 511.319226 511.7209 H(45)C(25)N(5)O(4)S(1) 0.0 Chemical derivative 912 0.0
+LTX+Lophotoxin@Y 416.147118 416.4212 H(24)C(22)O(8) 0.0 Post-translational 2039 0.0
+Methylmalonylation@S 100.016044 100.0728 H(4)C(4)O(3) 0.0 Chemical derivative 914 0.0
+AROD@C 820.336015 820.979 H(52)C(35)N(10)O(9)S(2) 0.0 Chemical derivative 938 0.0
+Cys->methylaminoAla@C -2.945522 -3.0238 H(3)C(1)N(1)S(-1) 0.0 Chemical derivative 939 0.0
+Cys->ethylaminoAla@C 11.070128 11.0028 H(5)C(2)N(1)S(-1) 0.0 Chemical derivative 940 0.0
+Label:13C(4)15N(2)+GG@K 120.050417 120.0601 H(6)13C(4)15N(2)O(2) 0.0 Isotopic label 923 0.0
+ethylamino@S 27.047285 27.0684 H(5)C(2)N(1)O(-1) 0.0 Chemical derivative 926 0.0
+ethylamino@T 27.047285 27.0684 H(5)C(2)N(1)O(-1) 0.0 Chemical derivative 926 0.0
+MercaptoEthanol@S 60.003371 60.1182 H(4)C(2)S(1) 0.0 Chemical derivative 928 0.0
+MercaptoEthanol@T 60.003371 60.1182 H(4)C(2)S(1) 0.0 Chemical derivative 928 0.0
+Atto495Maleimide@C 474.250515 474.5747 H(32)C(27)N(5)O(3) 0.0 Chemical derivative 935 0.0
+AMTzHexNAc2@T 502.202341 502.4757 H(30)C(19)N(6)O(10) 0.0 Chemical derivative 934 0.0
+AMTzHexNAc2@S 502.202341 502.4757 H(30)C(19)N(6)O(10) 0.0 Chemical derivative 934 0.0
+AMTzHexNAc2@N 502.202341 502.4757 H(30)C(19)N(6)O(10) 0.0 Chemical derivative 934 0.0
+Ethyl+Deamidated@Q 29.015316 29.0379 H(3)C(2)N(-1)O(1) 0.0 Chemical derivative 931 0.0
+Ethyl+Deamidated@N 29.015316 29.0379 H(3)C(2)N(-1)O(1) 0.0 Chemical derivative 931 0.0
+VFQQQTGG@K 845.403166 845.8991 H(55)C(37)N(11)O(12) 0.0 Other 932 0.0
+VIEVYQEQTGG@K 1203.577168 1204.2859 H(81)C(53)N(13)O(19) 0.0 Other 933 0.0
+Chlorination@W 33.961028 34.4451 H(-1)Cl(1) 0.0 Artefact 936 0.0
+Chlorination@Y 33.961028 34.4451 H(-1)Cl(1) 0.0 Artefact 936 0.0
+dichlorination@C 67.922055 68.8901 H(-2)Cl(2) 0.0 Chemical derivative 937 0.0
+dichlorination@Y 67.922055 68.8901 H(-2)Cl(2) 0.0 Artefact 937 0.0
+DNPS@C 198.981352 199.164 H(3)C(6)N(2)O(4)S(1) 0.0 Chemical derivative 941 0.0
+DNPS@W 198.981352 199.164 H(3)C(6)N(2)O(4)S(1) 0.0 Chemical derivative 941 0.0
+SulfoGMBS@C 458.162391 458.5306 H(26)C(22)N(4)O(5)S(1) 0.0 Other 942 0.0
+DimethylamineGMBS@C 267.158292 267.3241 H(21)C(13)N(3)O(3) 0.0 Chemical derivative 943 0.0
+Label:15N(2)2H(9)@K 11.050561 11.0423 H(-9)2H(9)N(-2)15N(2) 0.0 Isotopic label 944 0.0
+LG-anhydrolactam@Any_N-term 314.188195 314.4186 H(26)C(20)O(3) 0.0 Post-translational 946 0.0
+LG-anhydrolactam@K 314.188195 314.4186 H(26)C(20)O(3) 0.0 Post-translational 946 0.0
+LG-pyrrole@C 316.203845 316.4345 H(28)C(20)O(3) 0.0 Post-translational 947 0.0
+LG-pyrrole@Any_N-term 316.203845 316.4345 H(28)C(20)O(3) 0.0 Post-translational 947 0.0
+LG-pyrrole@K 316.203845 316.4345 H(28)C(20)O(3) 0.0 Post-translational 947 0.0
+LG-anhyropyrrole@Any_N-term 298.19328 298.4192 H(26)C(20)O(2) 0.0 Post-translational 948 0.0
+LG-anhyropyrrole@K 298.19328 298.4192 H(26)C(20)O(2) 0.0 Post-translational 948 0.0
+3-deoxyglucosone@R 144.042259 144.1253 H(8)C(6)O(4) 0.0 Multiple 949 0.0
+Cation:Li@D 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 0.0
+Cation:Li@E 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 0.0
+Cation:Li@Any_C-term 6.008178 5.9331 H(-1)Li(1) 0.0 Artefact 950 O[Li] 0.0
+Cation:Ca[II]@Any_C-term 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
+Cation:Ca[II]@E 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
+Cation:Ca[II]@D 37.946941 38.0621 H(-2)Ca(1) 0.0 Artefact 951 0.0
+Cation:Fe[II]@D 53.919289 53.8291 H(-2)Fe(1) 0.0 Artefact 952 0.0
+Cation:Fe[II]@E 53.919289 53.8291 H(-2)Fe(1) 0.0 Artefact 952 0.0
+Cation:Fe[II]@Any_C-term 53.919289 53.8291 H(-2)Fe(1) 0.0 Artefact 952 0.0
+Cation:Ni[II]@D 55.919696 56.6775 H(-2)Ni(1) 0.0 Artefact 953 0.0
+Cation:Ni[II]@E 55.919696 56.6775 H(-2)Ni(1) 0.0 Artefact 953 0.0
+Cation:Ni[II]@Any_C-term 55.919696 56.6775 H(-2)Ni(1) 0.0 Artefact 953 0.0
+Cation:Zn[II]@Any_C-term 61.913495 63.3931 H(-2)Zn(1) 0.0 Artefact 954 0.0
+Cation:Zn[II]@E 61.913495 63.3931 H(-2)Zn(1) 0.0 Artefact 954 0.0
+Cation:Zn[II]@D 61.913495 63.3931 H(-2)Zn(1) 0.0 Artefact 954 0.0
+Cation:Zn[II]@H 61.913495 63.3931 H(-2)Zn(1) 0.0 Artefact 954 0.0
+Cation:Ag@D 105.897267 106.8603 H(-1)Ag(1) 0.0 Artefact 955 0.0
+Cation:Ag@E 105.897267 106.8603 H(-1)Ag(1) 0.0 Artefact 955 0.0
+Cation:Ag@Any_C-term 105.897267 106.8603 H(-1)Ag(1) 0.0 Artefact 955 0.0
+Cation:Mg[II]@D 21.969392 22.2891 H(-2)Mg(1) 0.0 Artefact 956 0.0
+Cation:Mg[II]@E 21.969392 22.2891 H(-2)Mg(1) 0.0 Artefact 956 0.0
+Cation:Mg[II]@Any_C-term 21.969392 22.2891 H(-2)Mg(1) 0.0 Artefact 956 0.0
+2-succinyl@C 116.010959 116.0722 H(4)C(4)O(4) 0.0 Chemical derivative 957 0.0
+Propargylamine@D 37.031634 37.0632 H(3)C(3)N(1)O(-1) 0.0 Chemical derivative 958 0.0
+Propargylamine@Any_C-term 37.031634 37.0632 H(3)C(3)N(1)O(-1) 0.0 Chemical derivative 958 0.0
+Propargylamine@E 37.031634 37.0632 H(3)C(3)N(1)O(-1) 0.0 Chemical derivative 958 0.0
+Phosphopropargyl@T 116.997965 117.0431 H(4)C(3)N(1)O(2)P(1) 0.0 Multiple 959 0.0
+Phosphopropargyl@Y 116.997965 117.0431 H(4)C(3)N(1)O(2)P(1) 0.0 Multiple 959 0.0
+Phosphopropargyl@S 116.997965 117.0431 H(4)C(3)N(1)O(2)P(1) 0.0 Multiple 959 0.0
+SUMO2135@K 2135.920496 2137.2343 H(137)C(90)N(21)O(37)S(1) 0.0 Other 960 0.0
+SUMO3549@K 3549.536568 3551.6672 H(224)C(150)N(38)O(60)S(1) 0.0 Other 961 0.0
+serotonylation@Q 159.068414 159.1846 H(9)C(10)N(1)O(1) 0.0 Post-translational 1992 0.0
+BITC@Any_N-term 149.02992 149.2129 H(7)C(8)N(1)S(1) 0.0 Chemical derivative 978 0.0
+BITC@K 149.02992 149.2129 H(7)C(8)N(1)S(1) 0.0 Chemical derivative 978 0.0
+BITC@C 149.02992 149.2129 H(7)C(8)N(1)S(1) 0.0 Chemical derivative 978 0.0
+Carbofuran@S 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 977 0.0
+PEITC@Any_N-term 163.04557 163.2395 H(9)C(9)N(1)S(1) 0.0 Chemical derivative 979 0.0
+PEITC@K 163.04557 163.2395 H(9)C(9)N(1)S(1) 0.0 Chemical derivative 979 0.0
+PEITC@C 163.04557 163.2395 H(9)C(9)N(1)S(1) 0.0 Chemical derivative 979 0.0
+thioacylPA@K 159.035399 159.2062 H(9)C(6)N(1)O(2)S(1) 0.0 Chemical derivative 967 0.0
+maleimide3@K 969.366232 969.8975 H(59)C(37)N(7)O(23) 0.0 Post-translational 971 0.0
+maleimide3@C 969.366232 969.8975 H(59)C(37)N(7)O(23) 0.0 Post-translational 971 0.0
+maleimide5@K 1293.471879 1294.1787 H(79)C(49)N(7)O(33) 0.0 Post-translational 972 0.0
+maleimide5@C 1293.471879 1294.1787 H(79)C(49)N(7)O(33) 0.0 Post-translational 972 0.0
+Puromycin@Any_C-term 453.212452 453.4943 H(27)C(22)N(7)O(4) 0.0 Co-translational 973 0.0
+glucosone@R 160.037173 160.1247 H(8)C(6)O(5) 0.0 Other 981 0.0
+Label:13C(6)+Dimethyl@K 34.051429 34.0091 H(4)C(-4)13C(6) 0.0 Isotopic label 986 0.0
+cysTMT@C 299.166748 299.4322 H(25)C(14)N(3)O(2)S(1) 0.0 Chemical derivative 984 0.0
+cysTMT6plex@C 304.177202 304.3962 H(25)C(10)13C(4)N(2)15N(1)O(2)S(1) 0.0 Isotopic label 985 0.0
+ISD_z+2_ion@Any_N-term -15.010899 -15.0146 H(-1)N(-1) 0.0 Artefact 991 0.0
+Ammonium@E 17.026549 17.0305 H(3)N(1) 0.0 Artefact 989 0.0
+Ammonium@D 17.026549 17.0305 H(3)N(1) 0.0 Artefact 989 0.0
+Ammonium@Any_C-term 17.026549 17.0305 H(3)N(1) 0.0 Artefact 989 0.0
+Biotin:Sigma-B1267@C 449.17329 449.5239 H(27)C(20)N(5)O(5)S(1) 0.0 Chemical derivative 993 0.0
+Label:15N(1)@M 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@E 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@D 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@L 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@I 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@C 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@T 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@V 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@P 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@S 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@A 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@G 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@Y 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(1)@F 0.997035 0.9934 N(-1)15N(1) 0.0 Isotopic label 994 0.0
+Label:15N(2)@W 1.99407 1.9868 N(-2)15N(2) 0.0 Isotopic label 995 0.0
+Label:15N(2)@K 1.99407 1.9868 N(-2)15N(2) 0.0 Isotopic label 995 0.0
+Label:15N(2)@Q 1.99407 1.9868 N(-2)15N(2) 0.0 Isotopic label 995 0.0
+Label:15N(2)@N 1.99407 1.9868 N(-2)15N(2) 0.0 Isotopic label 995 0.0
+Label:15N(3)@H 2.991105 2.9802 N(-3)15N(3) 0.0 Isotopic label 996 0.0
+sulfo+amino@Y 94.967714 95.0778 H(1)N(1)O(3)S(1) 0.0 Chemical derivative 997 0.0
+AHA-Alkyne@M 107.077339 107.0504 H(5)C(4)N(5)O(1)S(-1) 0.0 Chemical derivative 1000 0.0
+AHA-Alkyne-KDDDD@M 695.280074 695.5723 H(37)C(26)N(11)O(14)S(-1) 0.0 Chemical derivative 1001 0.0
+EGCG1@C 456.069261 456.3558 H(16)C(22)O(11) 0.0 Post-translational 1002 0.0
+EGCG2@C 287.055563 287.2442 H(11)C(15)O(6) 0.0 Post-translational 1003 0.0
+Label:13C(6)15N(4)+Methyl@R 24.023919 23.9561 H(2)C(-5)13C(6)N(-4)15N(4) 0.0 Isotopic label 1004 0.0
+Label:13C(6)15N(4)+Dimethyl@R 38.039569 37.9827 H(4)C(-4)13C(6)N(-4)15N(4) 0.0 Isotopic label 1005 0.0
+Label:13C(6)15N(4)+Methyl:2H(3)13C(1)@R 28.046104 27.9673 H(-1)2H(3)C(-6)13C(7)N(-4)15N(4) 0.0 Isotopic label 1006 0.0
+Label:13C(6)15N(4)+Dimethyl:2H(6)13C(2)@R 46.083939 46.005 H(-2)2H(6)C(-6)13C(8)N(-4)15N(4) 0.0 Isotopic label 1007 0.0
+Cys->CamSec@C 104.965913 103.9463 H(3)C(2)N(1)O(1)S(-1)Se(1) 0.0 Non-standard residue 1008 0.0
+Thiazolidine@W 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@Y 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@H 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@R 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@K 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@Protein_N-term 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@C 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+Thiazolidine@F 12.0 12.0107 C(1) 0.0 Chemical derivative 1009 0.0
+DEDGFLYMVYASQETFG@K 1970.824411 1972.088 H(122)C(89)N(18)O(31)S(1) 18.010565 H(2)O(1) Post-translational 1010 0.5
+Biotin:Invitrogen-M1602@C 523.210069 523.6024 H(33)C(23)N(5)O(7)S(1) 0.0 Chemical derivative 1012 0.0
+Xlink:DSS[156]@K 156.078644 156.1791 H(12)C(8)O(3) 0.0 Chemical derivative 1020 0.0
+Xlink:DSS[156]@Protein_N-term 156.078644 156.1791 H(12)C(8)O(3) 0.0 Chemical derivative 1020 0.0
+DMPO@H 111.068414 111.1418 H(9)C(6)N(1)O(1) 0.0 Post-translational 1017 0.0
+DMPO@Y 111.068414 111.1418 H(9)C(6)N(1)O(1) 0.0 Post-translational 1017 0.0
+DMPO@C 111.068414 111.1418 H(9)C(6)N(1)O(1) 0.0 Post-translational 1017 0.0
+glycidamide@K 87.032028 87.0773 H(5)C(3)N(1)O(2) 0.0 Chemical derivative 1014 0.0
+glycidamide@Any_N-term 87.032028 87.0773 H(5)C(3)N(1)O(2) 0.0 Chemical derivative 1014 0.0
+Ahx2+Hsl@Any_C-term 309.205242 309.4039 H(27)C(16)N(3)O(3) 0.0 Non-standard residue 1015 0.0
+ICDID@C 138.06808 138.1638 H(10)C(8)O(2) 0.0 Isotopic label 1018 0.0
+ICDID:2H(6)@C 144.10574 144.2008 H(4)2H(6)C(8)O(2) 0.0 Isotopic label 1019 0.0
+Xlink:EGS[244]@Protein_N-term 244.058303 244.1981 H(12)C(10)O(7) 0.0 Chemical derivative 1021 0.0
+Xlink:EGS[244]@K 244.058303 244.1981 H(12)C(10)O(7) 0.0 Chemical derivative 1021 0.0
+Xlink:DST[132]@Protein_N-term 132.005873 132.0716 H(4)C(4)O(5) 0.0 Chemical derivative 1022 0.0
+Xlink:DST[132]@K 132.005873 132.0716 H(4)C(4)O(5) 0.0 Chemical derivative 1022 0.0
+Xlink:DTSSP[192]@Protein_N-term 191.991486 192.2559 H(8)C(6)O(3)S(2) 0.0 Chemical derivative 1023 0.0
+Xlink:DTSSP[192]@K 191.991486 192.2559 H(8)C(6)O(3)S(2) 0.0 Chemical derivative 1023 0.0
+Xlink:SMCC[237]@C 237.100108 237.2518 H(15)C(12)N(1)O(4) 0.0 Chemical derivative 1024 0.0
+Xlink:SMCC[237]@K 237.100108 237.2518 H(15)C(12)N(1)O(4) 0.0 Chemical derivative 1024 0.0
+Xlink:SMCC[237]@Protein_N-term 237.100108 237.2518 H(15)C(12)N(1)O(4) 0.0 Chemical derivative 1024 0.0
+2-nitrobenzyl@Y 135.032028 135.1201 H(5)C(7)N(1)O(2) 0.0 Chemical derivative 1032 0.0
+Xlink:DMP[140]@Protein_N-term 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Chemical derivative 1027 0.0
+Xlink:DMP[140]@K 140.094963 140.183 H(12)C(7)N(2)O(1) 0.0 Chemical derivative 1027 0.0
+Xlink:EGS[115]@Protein_N-term 115.026943 115.0874 H(5)C(4)N(1)O(3) 0.0 Chemical derivative 1028 0.0
+Xlink:EGS[115]@K 115.026943 115.0874 H(5)C(4)N(1)O(3) 0.0 Chemical derivative 1028 0.0
+Cys->SecNEM@C 172.992127 172.0203 H(7)C(6)N(1)O(2)S(-1)Se(1) 0.0 Non-standard residue 1033 0.0
+Cys->SecNEM:2H(5)@C 178.023511 177.0511 H(2)2H(5)C(6)N(1)O(2)S(-1)Se(1) 0.0 Chemical derivative 1034 0.0
+Thiadiazole@C 174.025169 174.2223 H(6)C(9)N(2)S(1) 0.0 Chemical derivative 1035 0.0
+Biotin:Thermo-88310@K 196.121178 196.2462 H(16)C(10)N(2)O(2) 0.0 Chemical derivative 1031 0.0
+TAMRA-FP@Y 659.312423 659.7514 H(46)C(37)N(3)O(6)P(1) 0.0 Chemical derivative 1038 0.0
+TAMRA-FP@S 659.312423 659.7514 H(46)C(37)N(3)O(6)P(1) 0.0 Chemical derivative 1038 0.0
+Biotin:Thermo-21901+H2O@C 543.236284 543.6336 H(37)C(23)N(5)O(8)S(1) 0.0 Chemical derivative 1039 0.0
+Deoxyhypusine@Q 71.073499 71.121 H(9)C(4)N(1) 0.0 Chemical derivative 1041 0.0
+Deoxyhypusine@K 71.073499 71.121 H(9)C(4)N(1) 0.0 Post-translational 1041 0.0
+Acetyldeoxyhypusine@K 113.084064 113.1576 H(11)C(6)N(1)O(1) 0.0 Post-translational 1042 0.0
+Acetylhypusine@K 129.078979 129.157 H(11)C(6)N(1)O(2) 0.0 Post-translational 1043 0.0
+Ala->Cys@A 31.972071 32.065 H(0)C(0)N(0)O(0)S(1) 0.0 AA substitution 1044 0.0
+Ala->Phe@A 76.0313 76.096 H(4)C(6)N(0)O(0)S(0) 0.0 AA substitution 1045 0.0
+Ala->His@A 66.021798 66.0614 H(2)C(3)N(2)O(0)S(0) 0.0 AA substitution 1046 0.0
+Ala->Xle@A 42.04695 42.0797 H(6)C(3) 0.0 AA substitution 1047 0.0
+Ala->Lys@A 57.057849 57.0944 H(7)C(3)N(1)O(0)S(0) 0.0 AA substitution 1048 0.0
+Ala->Met@A 60.003371 60.1182 H(4)C(2)N(0)O(0)S(1) 0.0 AA substitution 1049 0.0
+Ala->Asn@A 43.005814 43.0247 H(1)C(1)N(1)O(1)S(0) 0.0 AA substitution 1050 0.0
+Ala->Gln@A 57.021464 57.0513 H(3)C(2)N(1)O(1)S(0) 0.0 AA substitution 1051 0.0
+Ala->Arg@A 85.063997 85.1078 H(7)C(3)N(3)O(0)S(0) 0.0 AA substitution 1052 0.0
+Ala->Trp@A 115.042199 115.132 H(5)C(8)N(1)O(0)S(0) 0.0 AA substitution 1053 0.0
+Ala->Tyr@A 92.026215 92.0954 H(4)C(6)N(0)O(1)S(0) 0.0 AA substitution 1054 0.0
+Cys->Ala@C -31.972071 -32.065 H(0)C(0)N(0)O(0)S(-1) 0.0 AA substitution 1055 0.0
+Cys->Asp@C 12.017759 11.9445 H(0)C(1)N(0)O(2)S(-1) 0.0 AA substitution 1056 0.0
+Cys->Glu@C 26.033409 25.9711 H(2)C(2)N(0)O(2)S(-1) 0.0 AA substitution 1057 0.0
+Cys->His@C 34.049727 33.9964 H(2)C(3)N(2)O(0)S(-1) 0.0 AA substitution 1058 0.0
+Cys->Xle@C 10.07488 10.0147 H(6)C(3)S(-1) 0.0 AA substitution 1059 0.0
+Cys->Lys@C 25.085779 25.0294 H(7)C(3)N(1)O(0)S(-1) 0.0 AA substitution 1060 0.0
+Cys->Met@C 28.0313 28.0532 H(4)C(2)N(0)O(0)S(0) 0.0 AA substitution 1061 0.0
+Cys->Asn@C 11.033743 10.9597 H(1)C(1)N(1)O(1)S(-1) 0.0 AA substitution 1062 0.0
+Cys->Pro@C -5.956421 -6.0277 H(2)C(2)N(0)O(0)S(-1) 0.0 AA substitution 1063 0.0
+Cys->Gln@C 25.049393 24.9863 H(3)C(2)N(1)O(1)S(-1) 0.0 AA substitution 1064 0.0
+Cys->Thr@C -1.961506 -2.039 H(2)C(1)N(0)O(1)S(-1) 0.0 AA substitution 1065 0.0
+Cys->Val@C -3.940771 -4.0118 H(4)C(2)N(0)O(0)S(-1) 0.0 AA substitution 1066 0.0
+Asp->Cys@D -12.017759 -11.9445 H(0)C(-1)N(0)O(-2)S(1) 0.0 AA substitution 1067 0.0
+Asp->Phe@D 32.041471 32.0865 H(4)C(5)N(0)O(-2)S(0) 0.0 AA substitution 1068 0.0
+Asp->Xle@D -1.942879 -1.9298 H(6)C(2)O(-2) 0.0 AA substitution 1069 0.0
+Asp->Lys@D 13.06802 13.0849 H(7)C(2)N(1)O(-2)S(0) 0.0 AA substitution 1070 0.0
+Asp->Met@D 16.013542 16.1087 H(4)C(1)N(0)O(-2)S(1) 0.0 AA substitution 1071 0.0
+Asp->Pro@D -17.974179 -17.9722 H(2)C(1)N(0)O(-2)S(0) 0.0 AA substitution 1072 0.0
+Asp->Gln@D 13.031634 13.0418 H(3)C(1)N(1)O(-1)S(0) 0.0 AA substitution 1073 0.0
+Asp->Arg@D 41.074168 41.0983 H(7)C(2)N(3)O(-2)S(0) 0.0 AA substitution 1074 0.0
+Asp->Ser@D -27.994915 -28.0101 H(0)C(-1)N(0)O(-1)S(0) 0.0 AA substitution 1075 0.0
+Asp->Thr@D -13.979265 -13.9835 H(2)C(0)N(0)O(-1)S(0) 0.0 AA substitution 1076 0.0
+Asp->Trp@D 71.05237 71.1225 H(5)C(7)N(1)O(-2)S(0) 0.0 AA substitution 1077 0.0
+Glu->Cys@E -26.033409 -25.9711 H(-2)C(-2)N(0)O(-2)S(1) 0.0 AA substitution 1078 0.0
+Glu->Phe@E 18.025821 18.0599 H(2)C(4)N(0)O(-2)S(0) 0.0 AA substitution 1079 0.0
+Glu->His@E 8.016319 8.0253 H(0)C(1)N(2)O(-2)S(0) 0.0 AA substitution 1080 0.0
+Glu->Xle@E -15.958529 -15.9563 H(4)C(1)O(-2) 0.0 AA substitution 1081 0.0
+Glu->Met@E 1.997892 2.0821 H(2)C(0)N(0)O(-2)S(1) 0.0 AA substitution 1082 0.0
+Glu->Asn@E -14.999666 -15.0113 H(-1)C(-1)N(1)O(-1)S(0) 0.0 AA substitution 1083 0.0
+Glu->Pro@E -31.989829 -31.9988 H(0)C(0)N(0)O(-2)S(0) 0.0 AA substitution 1084 0.0
+Glu->Arg@E 27.058518 27.0717 H(5)C(1)N(3)O(-2)S(0) 0.0 AA substitution 1085 0.0
+Glu->Ser@E -42.010565 -42.0367 H(-2)C(-2)N(0)O(-1)S(0) 0.0 AA substitution 1086 0.0
+Glu->Thr@E -27.994915 -28.0101 H(0)C(-1)N(0)O(-1)S(0) 0.0 AA substitution 1087 0.0
+Glu->Trp@E 57.03672 57.0959 H(3)C(6)N(1)O(-2)S(0) 0.0 AA substitution 1088 0.0
+Glu->Tyr@E 34.020735 34.0593 H(2)C(4)N(0)O(-1)S(0) 0.0 AA substitution 1089 0.0
+Phe->Ala@F -76.0313 -76.096 H(-4)C(-6)N(0)O(0)S(0) 0.0 AA substitution 1090 0.0
+Phe->Asp@F -32.041471 -32.0865 H(-4)C(-5)N(0)O(2)S(0) 0.0 AA substitution 1091 0.0
+Phe->Glu@F -18.025821 -18.0599 H(-2)C(-4)N(0)O(2)S(0) 0.0 AA substitution 1092 0.0
+Phe->Gly@F -90.04695 -90.1225 H(-6)C(-7)N(0)O(0)S(0) 0.0 AA substitution 1093 0.0
+Phe->His@F -10.009502 -10.0346 H(-2)C(-3)N(2)O(0)S(0) 0.0 AA substitution 1094 0.0
+Phe->Lys@F -18.973451 -19.0016 H(3)C(-3)N(1)O(0)S(0) 0.0 AA substitution 1095 0.0
+Phe->Met@F -16.027929 -15.9778 H(0)C(-4)N(0)O(0)S(1) 0.0 AA substitution 1096 0.0
+Phe->Asn@F -33.025486 -33.0712 H(-3)C(-5)N(1)O(1)S(0) 0.0 AA substitution 1097 0.0
+Phe->Pro@F -50.01565 -50.0587 H(-2)C(-4)N(0)O(0)S(0) 0.0 AA substitution 1098 0.0
+Phe->Gln@F -19.009836 -19.0446 H(-1)C(-4)N(1)O(1)S(0) 0.0 AA substitution 1099 0.0
+Phe->Arg@F 9.032697 9.0118 H(3)C(-3)N(3)O(0)S(0) 0.0 AA substitution 1100 0.0
+Phe->Thr@F -46.020735 -46.07 H(-2)C(-5)N(0)O(1)S(0) 0.0 AA substitution 1101 0.0
+Phe->Trp@F 39.010899 39.036 H(1)C(2)N(1)O(0)S(0) 0.0 AA substitution 1102 0.0
+Gly->Phe@G 90.04695 90.1225 H(6)C(7)N(0)O(0)S(0) 0.0 AA substitution 1103 0.0
+Gly->His@G 80.037448 80.088 H(4)C(4)N(2)O(0)S(0) 0.0 AA substitution 1104 0.0
+Gly->Xle@G 56.0626 56.1063 H(8)C(4) 0.0 AA substitution 1105 0.0
+Gly->Lys@G 71.073499 71.121 H(9)C(4)N(1)O(0)S(0) 0.0 AA substitution 1106 0.0
+Gly->Met@G 74.019021 74.1447 H(6)C(3)N(0)O(0)S(1) 0.0 AA substitution 1107 0.0
+Gly->Asn@G 57.021464 57.0513 H(3)C(2)N(1)O(1)S(0) 0.0 AA substitution 1108 0.0
+Gly->Pro@G 40.0313 40.0639 H(4)C(3)N(0)O(0)S(0) 0.0 AA substitution 1109 0.0
+Gly->Gln@G 71.037114 71.0779 H(5)C(3)N(1)O(1)S(0) 0.0 AA substitution 1110 0.0
+Gly->Thr@G 44.026215 44.0526 H(4)C(2)N(0)O(1)S(0) 0.0 AA substitution 1111 0.0
+Gly->Tyr@G 106.041865 106.1219 H(6)C(7)N(0)O(1)S(0) 0.0 AA substitution 1112 0.0
+His->Ala@H -66.021798 -66.0614 H(-2)C(-3)N(-2)O(0)S(0) 0.0 AA substitution 1113 0.0
+His->Cys@H
-34.049727 -33.9964 H(-2)C(-3)N(-2)O(0)S(1) 0.0 AA substitution 1114 0.0 +His->Glu@H -8.016319 -8.0253 H(0)C(-1)N(-2)O(2)S(0) 0.0 AA substitution 1115 0.0 +His->Phe@H 10.009502 10.0346 H(2)C(3)N(-2)O(0)S(0) 0.0 AA substitution 1116 0.0 +His->Gly@H -80.037448 -80.088 H(-4)C(-4)N(-2)O(0)S(0) 0.0 AA substitution 1117 0.0 +His->Lys@H -8.963949 -8.967 H(5)C(0)N(-1)O(0)S(0) 0.0 AA substitution 1119 0.0 +His->Met@H -6.018427 -5.9432 H(2)C(-1)N(-2)O(0)S(1) 0.0 AA substitution 1120 0.0 +His->Ser@H -50.026883 -50.062 H(-2)C(-3)N(-2)O(1)S(0) 0.0 AA substitution 1121 0.0 +His->Thr@H -36.011233 -36.0354 H(0)C(-2)N(-2)O(1)S(0) 0.0 AA substitution 1122 0.0 +His->Val@H -37.990498 -38.0082 H(2)C(-1)N(-2)O(0)S(0) 0.0 AA substitution 1123 0.0 +His->Trp@H 49.020401 49.0706 H(3)C(5)N(-1)O(0)S(0) 0.0 AA substitution 1124 0.0 +Xle->Cys@L -10.07488 -10.0147 H(-6)C(-3)N(0)O(0)S(1) 0.0 AA substitution 1126 0.0 +Xle->Cys@I -10.07488 -10.0147 H(-6)C(-3)N(0)O(0)S(1) 0.0 AA substitution 1126 0.0 +Xle->Asp@L 1.942879 1.9298 H(-6)C(-2)N(0)O(2)S(0) 0.0 AA substitution 1127 0.0 +Xle->Asp@I 1.942879 1.9298 H(-6)C(-2)N(0)O(2)S(0) 0.0 AA substitution 1127 0.0 +Xle->Glu@L 15.958529 15.9563 H(-4)C(-1)N(0)O(2)S(0) 0.0 AA substitution 1128 0.0 +Xle->Glu@I 15.958529 15.9563 H(-4)C(-1)N(0)O(2)S(0) 0.0 AA substitution 1128 0.0 +Xle->Gly@L -56.0626 -56.1063 H(-8)C(-4)N(0)O(0)S(0) 0.0 AA substitution 1129 0.0 +Xle->Gly@I -56.0626 -56.1063 H(-8)C(-4)N(0)O(0)S(0) 0.0 AA substitution 1129 0.0 +Xle->Tyr@L 49.979265 50.0156 H(-2)C(3)N(0)O(1)S(0) 0.0 AA substitution 1130 0.0 +Xle->Tyr@I 49.979265 50.0156 H(-2)C(3)N(0)O(1)S(0) 0.0 AA substitution 1130 0.0 +Lys->Ala@K -57.057849 -57.0944 H(-7)C(-3)N(-1)O(0)S(0) 0.0 AA substitution 1131 0.0 +Lys->Cys@K -25.085779 -25.0294 H(-7)C(-3)N(-1)O(0)S(1) 0.0 AA substitution 1132 0.0 +Lys->Asp@K -13.06802 -13.0849 H(-7)C(-2)N(-1)O(2)S(0) 0.0 AA substitution 1133 0.0 +Lys->Phe@K 18.973451 19.0016 H(-3)C(3)N(-1)O(0)S(0) 0.0 AA substitution 1134 0.0 +Lys->Gly@K -71.073499 -71.121 H(-9)C(-4)N(-1)O(0)S(0) 0.0 AA substitution 1135 0.0 +Lys->His@K 8.963949 8.967 H(-5)C(0)N(1)O(0)S(0) 0.0 AA substitution 1136 0.0 +Lys->Pro@K -31.042199 -31.0571 H(-5)C(-1)N(-1)O(0)S(0) 0.0 AA substitution 1137 0.0 +Lys->Ser@K -41.062935 -41.095 H(-7)C(-3)N(-1)O(1)S(0) 0.0 AA substitution 1138 0.0 +Lys->Val@K -29.026549 -29.0412 H(-3)C(-1)N(-1)O(0)S(0) 0.0 AA substitution 1139 0.0 +Lys->Trp@K 57.98435 58.0376 H(-2)C(5)N(0)O(0)S(0) 0.0 AA substitution 1140 0.0 +Lys->Tyr@K 34.968366 35.001 H(-3)C(3)N(-1)O(1)S(0) 0.0 AA substitution 1141 0.0 +Met->Ala@M -60.003371 -60.1182 H(-4)C(-2)N(0)O(0)S(-1) 0.0 AA substitution 1142 0.0 +Met->Cys@M -28.0313 -28.0532 H(-4)C(-2)N(0)O(0)S(0) 0.0 AA substitution 1143 0.0 +Met->Asp@M -16.013542 -16.1087 H(-4)C(-1)N(0)O(2)S(-1) 0.0 AA substitution 1144 0.0 +Met->Glu@M -1.997892 -2.0821 H(-2)C(0)N(0)O(2)S(-1) 0.0 AA substitution 1145 0.0 +Met->Phe@M 16.027929 15.9778 H(0)C(4)N(0)O(0)S(-1) 0.0 AA substitution 1146 0.0 +Met->Gly@M -74.019021 -74.1447 H(-6)C(-3)N(0)O(0)S(-1) 0.0 AA substitution 1147 0.0 +Met->His@M 6.018427 5.9432 H(-2)C(1)N(2)O(0)S(-1) 0.0 AA substitution 1148 0.0 +Met->Asn@M -16.997557 -17.0934 H(-3)C(-1)N(1)O(1)S(-1) 0.0 AA substitution 1149 0.0 +Met->Pro@M -33.987721 -34.0809 H(-2)C(0)N(0)O(0)S(-1) 0.0 AA substitution 1150 0.0 +Met->Gln@M -2.981907 -3.0668 H(-1)C(0)N(1)O(1)S(-1) 0.0 AA substitution 1151 0.0 +Met->Ser@M -44.008456 -44.1188 H(-4)C(-2)N(0)O(1)S(-1) 0.0 AA substitution 1152 0.0 +Met->Trp@M 55.038828 55.0138 H(1)C(6)N(1)O(0)S(-1) 0.0 AA substitution 1153 0.0 +Met->Tyr@M 
32.022844 31.9772 H(0)C(4)N(0)O(1)S(-1) 0.0 AA substitution 1154 0.0 +Asn->Ala@N -43.005814 -43.0247 H(-1)C(-1)N(-1)O(-1)S(0) 0.0 AA substitution 1155 0.0 +Asn->Cys@N -11.033743 -10.9597 H(-1)C(-1)N(-1)O(-1)S(1) 0.0 AA substitution 1156 0.0 +Asn->Glu@N 14.999666 15.0113 H(1)C(1)N(-1)O(1)S(0) 0.0 AA substitution 1157 0.0 +Asn->Phe@N 33.025486 33.0712 H(3)C(5)N(-1)O(-1)S(0) 0.0 AA substitution 1158 0.0 +Asn->Gly@N -57.021464 -57.0513 H(-3)C(-2)N(-1)O(-1)S(0) 0.0 AA substitution 1159 0.0 +Asn->Met@N 16.997557 17.0934 H(3)C(1)N(-1)O(-1)S(1) 0.0 AA substitution 1160 0.0 +Asn->Pro@N -16.990164 -16.9875 H(1)C(1)N(-1)O(-1)S(0) 0.0 AA substitution 1161 0.0 +Asn->Gln@N 14.01565 14.0266 H(2)C(1)N(0)O(0)S(0) 0.0 AA substitution 1162 0.0 +Asn->Arg@N 42.058184 42.083 H(6)C(2)N(2)O(-1)S(0) 0.0 AA substitution 1163 0.0 +Asn->Val@N -14.974514 -14.9716 H(3)C(1)N(-1)O(-1)S(0) 0.0 AA substitution 1164 0.0 +Asn->Trp@N 72.036386 72.1073 H(4)C(7)N(0)O(-1)S(0) 0.0 AA substitution 1165 0.0 +Pro->Cys@P 5.956421 6.0277 H(-2)C(-2)N(0)O(0)S(1) 0.0 AA substitution 1166 0.0 +Pro->Asp@P 17.974179 17.9722 H(-2)C(-1)N(0)O(2)S(0) 0.0 AA substitution 1167 0.0 +Pro->Glu@P 31.989829 31.9988 H(0)C(0)N(0)O(2)S(0) 0.0 AA substitution 1168 0.0 +Pro->Phe@P 50.01565 50.0587 H(2)C(4)N(0)O(0)S(0) 0.0 AA substitution 1169 0.0 +Pro->Gly@P -40.0313 -40.0639 H(-4)C(-3)N(0)O(0)S(0) 0.0 AA substitution 1170 0.0 +Pro->Lys@P 31.042199 31.0571 H(5)C(1)N(1)O(0)S(0) 0.0 AA substitution 1171 0.0 +Pro->Met@P 33.987721 34.0809 H(2)C(0)N(0)O(0)S(1) 0.0 AA substitution 1172 0.0 +Pro->Asn@P 16.990164 16.9875 H(-1)C(-1)N(1)O(1)S(0) 0.0 AA substitution 1173 0.0 +Pro->Val@P 2.01565 2.0159 H(2)C(0)N(0)O(0)S(0) 0.0 AA substitution 1174 0.0 +Pro->Trp@P 89.026549 89.0947 H(3)C(6)N(1)O(0)S(0) 0.0 AA substitution 1175 0.0 +Pro->Tyr@P 66.010565 66.0581 H(2)C(4)N(0)O(1)S(0) 0.0 AA substitution 1176 0.0 +Gln->Ala@Q -57.021464 -57.0513 H(-3)C(-2)N(-1)O(-1)S(0) 0.0 AA substitution 1177 0.0 +Gln->Cys@Q -25.049393 -24.9863 H(-3)C(-2)N(-1)O(-1)S(1) 0.0 AA substitution 1178 0.0 +Gln->Asp@Q -13.031634 -13.0418 H(-3)C(-1)N(-1)O(1)S(0) 0.0 AA substitution 1179 0.0 +Gln->Phe@Q 19.009836 19.0446 H(1)C(4)N(-1)O(-1)S(0) 0.0 AA substitution 1180 0.0 +Gln->Gly@Q -71.037114 -71.0779 H(-5)C(-3)N(-1)O(-1)S(0) 0.0 AA substitution 1181 0.0 +Gln->Met@Q 2.981907 3.0668 H(1)C(0)N(-1)O(-1)S(1) 0.0 AA substitution 1182 0.0 +Gln->Asn@Q -14.01565 -14.0266 H(-2)C(-1)N(0)O(0)S(0) 0.0 AA substitution 1183 0.0 +Gln->Ser@Q -41.026549 -41.0519 H(-3)C(-2)N(-1)O(0)S(0) 0.0 AA substitution 1184 0.0 +Gln->Thr@Q -27.010899 -27.0253 H(-1)C(-1)N(-1)O(0)S(0) 0.0 AA substitution 1185 0.0 +Gln->Val@Q -28.990164 -28.9982 H(1)C(0)N(-1)O(-1)S(0) 0.0 AA substitution 1186 0.0 +Gln->Trp@Q 58.020735 58.0807 H(2)C(6)N(0)O(-1)S(0) 0.0 AA substitution 1187 0.0 +Gln->Tyr@Q 35.004751 35.044 H(1)C(4)N(-1)O(0)S(0) 0.0 AA substitution 1188 0.0 +Arg->Ala@R -85.063997 -85.1078 H(-7)C(-3)N(-3)O(0)S(0) 0.0 AA substitution 1189 0.0 +Arg->Asp@R -41.074168 -41.0983 H(-7)C(-2)N(-3)O(2)S(0) 0.0 AA substitution 1190 0.0 +Arg->Glu@R -27.058518 -27.0717 H(-5)C(-1)N(-3)O(2)S(0) 0.0 AA substitution 1191 0.0 +Arg->Asn@R -42.058184 -42.083 H(-6)C(-2)N(-2)O(1)S(0) 0.0 AA substitution 1192 0.0 +Arg->Val@R -57.032697 -57.0546 H(-3)C(-1)N(-3)O(0)S(0) 0.0 AA substitution 1193 0.0 +Arg->Tyr@R 6.962218 6.9876 H(-3)C(3)N(-3)O(1)S(0) 0.0 AA substitution 1194 0.0 +Arg->Phe@R -9.032697 -9.0118 H(-3)C(3)N(-3) 0.0 AA substitution 1195 0.0 +Ser->Asp@S 27.994915 28.0101 H(0)C(1)N(0)O(1)S(0) 0.0 AA substitution 1196 0.0 +Ser->Glu@S 42.010565 42.0367 
H(2)C(2)N(0)O(1)S(0) 0.0 AA substitution 1197 0.0 +Ser->His@S 50.026883 50.062 H(2)C(3)N(2)O(-1)S(0) 0.0 AA substitution 1198 0.0 +Ser->Lys@S 41.062935 41.095 H(7)C(3)N(1)O(-1)S(0) 0.0 AA substitution 1199 0.0 +Ser->Met@S 44.008456 44.1188 H(4)C(2)N(0)O(-1)S(1) 0.0 AA substitution 1200 0.0 +Ser->Gln@S 41.026549 41.0519 H(3)C(2)N(1)O(0)S(0) 0.0 AA substitution 1201 0.0 +Ser->Val@S 12.036386 12.0538 H(4)C(2)N(0)O(-1)S(0) 0.0 AA substitution 1202 0.0 +Thr->Cys@T 1.961506 2.039 H(-2)C(-1)N(0)O(-1)S(1) 0.0 AA substitution 1203 0.0 +Thr->Asp@T 13.979265 13.9835 H(-2)C(0)N(0)O(1)S(0) 0.0 AA substitution 1204 0.0 +Thr->Glu@T 27.994915 28.0101 H(0)C(1)N(0)O(1)S(0) 0.0 AA substitution 1205 0.0 +Thr->Phe@T 46.020735 46.07 H(2)C(5)N(0)O(-1)S(0) 0.0 AA substitution 1206 0.0 +Thr->Gly@T -44.026215 -44.0526 H(-4)C(-2)N(0)O(-1)S(0) 0.0 AA substitution 1207 0.0 +Thr->His@T 36.011233 36.0354 H(0)C(2)N(2)O(-1)S(0) 0.0 AA substitution 1208 0.0 +Thr->Gln@T 27.010899 27.0253 H(1)C(1)N(1)O(0)S(0) 0.0 AA substitution 1209 0.0 +Thr->Val@T -1.979265 -1.9728 H(2)C(1)N(0)O(-1)S(0) 0.0 AA substitution 1210 0.0 +Thr->Trp@T 85.031634 85.106 H(3)C(7)N(1)O(-1)S(0) 0.0 AA substitution 1211 0.0 +Thr->Tyr@T 62.01565 62.0694 H(2)C(5)N(0)O(0)S(0) 0.0 AA substitution 1212 0.0 +Val->Cys@V 3.940771 4.0118 H(-4)C(-2)N(0)O(0)S(1) 0.0 AA substitution 1213 0.0 +Val->His@V 37.990498 38.0082 H(-2)C(1)N(2)O(0)S(0) 0.0 AA substitution 1214 0.0 +Val->Lys@V 29.026549 29.0412 H(3)C(1)N(1)O(0)S(0) 0.0 AA substitution 1215 0.0 +Val->Asn@V 14.974514 14.9716 H(-3)C(-1)N(1)O(1)S(0) 0.0 AA substitution 1216 0.0 +Val->Pro@V -2.01565 -2.0159 H(-2)C(0)N(0)O(0)S(0) 0.0 AA substitution 1217 0.0 +Val->Gln@V 28.990164 28.9982 H(-1)C(0)N(1)O(1)S(0) 0.0 AA substitution 1218 0.0 +Val->Arg@V 57.032697 57.0546 H(3)C(1)N(3)O(0)S(0) 0.0 AA substitution 1219 0.0 +Val->Ser@V -12.036386 -12.0538 H(-4)C(-2)N(0)O(1)S(0) 0.0 AA substitution 1220 0.0 +Val->Thr@V 1.979265 1.9728 H(-2)C(-1)N(0)O(1)S(0) 0.0 AA substitution 1221 0.0 +Val->Trp@V 87.010899 87.0788 H(1)C(6)N(1)O(0)S(0) 0.0 AA substitution 1222 0.0 +Val->Tyr@V 63.994915 64.0422 H(0)C(4)N(0)O(1)S(0) 0.0 AA substitution 1223 0.0 +Trp->Ala@W -115.042199 -115.132 H(-5)C(-8)N(-1)O(0)S(0) 0.0 AA substitution 1224 0.0 +Trp->Asp@W -71.05237 -71.1225 H(-5)C(-7)N(-1)O(2)S(0) 0.0 AA substitution 1225 0.0 +Trp->Glu@W -57.03672 -57.0959 H(-3)C(-6)N(-1)O(2)S(0) 0.0 AA substitution 1226 0.0 +Trp->Phe@W -39.010899 -39.036 H(-1)C(-2)N(-1)O(0)S(0) 0.0 AA substitution 1227 0.0 +Trp->His@W -49.020401 -49.0706 H(-3)C(-5)N(1)O(0)S(0) 0.0 AA substitution 1228 0.0 +Trp->Lys@W -57.98435 -58.0376 H(2)C(-5)N(0)O(0)S(0) 0.0 AA substitution 1229 0.0 +Trp->Met@W -55.038828 -55.0138 H(-1)C(-6)N(-1)O(0)S(1) 0.0 AA substitution 1230 0.0 +Trp->Asn@W -72.036386 -72.1073 H(-4)C(-7)N(0)O(1)S(0) 0.0 AA substitution 1231 0.0 +Trp->Pro@W -89.026549 -89.0947 H(-3)C(-6)N(-1)O(0)S(0) 0.0 AA substitution 1232 0.0 +Trp->Gln@W -58.020735 -58.0807 H(-2)C(-6)N(0)O(1)S(0) 0.0 AA substitution 1233 0.0 +Trp->Thr@W -85.031634 -85.106 H(-3)C(-7)N(-1)O(1)S(0) 0.0 AA substitution 1234 0.0 +Trp->Val@W -87.010899 -87.0788 H(-1)C(-6)N(-1)O(0)S(0) 0.0 AA substitution 1235 0.0 +Trp->Tyr@W -23.015984 -23.0366 H(-1)C(-2)N(-1)O(1)S(0) 0.0 AA substitution 1236 0.0 +Tyr->Ala@Y -92.026215 -92.0954 H(-4)C(-6)N(0)O(-1)S(0) 0.0 AA substitution 1237 0.0 +Tyr->Glu@Y -34.020735 -34.0593 H(-2)C(-4)N(0)O(1)S(0) 0.0 AA substitution 1238 0.0 +Tyr->Gly@Y -106.041865 -106.1219 H(-6)C(-7)N(0)O(-1)S(0) 0.0 AA substitution 1239 0.0 +Tyr->Lys@Y -34.968366 -35.001 H(3)C(-3)N(1)O(-1)S(0) 
0.0 AA substitution 1240 0.0 +Tyr->Met@Y -32.022844 -31.9772 H(0)C(-4)N(0)O(-1)S(1) 0.0 AA substitution 1241 0.0 +Tyr->Pro@Y -66.010565 -66.0581 H(-2)C(-4)N(0)O(-1)S(0) 0.0 AA substitution 1242 0.0 +Tyr->Gln@Y -35.004751 -35.044 H(-1)C(-4)N(1)O(0)S(0) 0.0 AA substitution 1243 0.0 +Tyr->Arg@Y -6.962218 -6.9876 H(3)C(-3)N(3)O(-1)S(0) 0.0 AA substitution 1244 0.0 +Tyr->Thr@Y -62.01565 -62.0694 H(-2)C(-5)N(0)O(0)S(0) 0.0 AA substitution 1245 0.0 +Tyr->Val@Y -63.994915 -64.0422 H(0)C(-4)N(0)O(-1)S(0) 0.0 AA substitution 1246 0.0 +Tyr->Trp@Y 23.015984 23.0366 H(1)C(2)N(1)O(-1)S(0) 0.0 AA substitution 1247 0.0 +Tyr->Xle@Y -49.979265 -50.0156 H(2)C(-3)O(-1) 0.0 AA substitution 1248 0.0 +AHA-SS@M 195.075625 195.1787 H(9)C(7)N(5)O(2) 0.0 Multiple 1249 0.0 +AHA-SS_CAM@M 252.097088 252.23 H(12)C(9)N(6)O(3) 0.0 Multiple 1250 0.0 +Biotin:Thermo-33033@Anywhere 548.223945 548.7211 H(36)C(25)N(6)O(4)S(2) 0.0 Chemical derivative 1251 0.0 +Biotin:Thermo-33033-H@Anywhere 546.208295 546.7053 H(34)C(25)N(6)O(4)S(2) 0.0 Chemical derivative 1252 0.0 +2-monomethylsuccinyl@C 130.026609 130.0987 H(6)C(5)O(4) 0.0 Chemical derivative 1253 0.0 +Saligenin@H 106.041865 106.1219 H(6)C(7)O(1) 0.0 Chemical derivative 1254 0.0 +Saligenin@K 106.041865 106.1219 H(6)C(7)O(1) 0.0 Chemical derivative 1254 0.0 +Cresylphosphate@R 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +Cresylphosphate@S 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +Cresylphosphate@T 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +Cresylphosphate@Y 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +Cresylphosphate@K 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +Cresylphosphate@H 170.013281 170.1024 H(7)C(7)O(3)P(1) 0.0 Chemical derivative 1255 0.0 +CresylSaligeninPhosphate@R 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +CresylSaligeninPhosphate@S 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +CresylSaligeninPhosphate@T 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +CresylSaligeninPhosphate@Y 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +CresylSaligeninPhosphate@K 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +CresylSaligeninPhosphate@H 276.055146 276.2244 H(13)C(14)O(4)P(1) 0.0 Chemical derivative 1256 0.0 +Ub-Br2@C 100.063663 100.1191 H(8)C(4)N(2)O(1) 0.0 Chemical derivative 1257 0.0 +Ub-VME@C 172.084792 172.1818 H(12)C(7)N(2)O(3) 0.0 Chemical derivative 1258 0.0 +Ub-fluorescein@C 597.209772 597.598 H(29)C(31)N(6)O(7) 0.0 Chemical derivative 1261 0.0 +2-dimethylsuccinyl@C 144.042259 144.1253 H(8)C(6)O(4) 0.0 Chemical derivative 1262 0.0 +Gly@T 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 1263 0.0 +Gly@S 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 1263 0.0 +Gly@K 57.021464 57.0513 H(3)C(2)N(1)O(1) 0.0 Chemical derivative 1263 0.0 +pupylation@K 243.085521 243.2166 H(13)C(9)N(3)O(5) 0.0 Post-translational 1264 0.0 +Label:13C(4)@M 4.013419 3.9706 C(-4)13C(4) 0.0 Isotopic label 1266 0.0 +HCysteinyl@C 133.019749 133.1689 H(7)C(4)N(1)O(2)S(1) 0.0 Post-translational 1271 0.0 +Label:13C(4)+Oxidation@M 20.008334 19.97 C(-4)13C(4)O(1) 0.0 Isotopic label 1267 0.0 +UgiJoullie@E 1106.48935 1107.1274 H(60)C(47)N(23)O(10) 0.0 Chemical derivative 1276 0.0 +UgiJoullie@D 1106.48935 1107.1274 H(60)C(47)N(23)O(10) 0.0 Chemical derivative 1276 0.0 +HCysThiolactone@K 117.024835 117.1695 H(7)C(4)N(1)O(1)S(1) 
0.0 Post-translational 1270 0.0 +UgiJoullieProGly@D 154.074228 154.1665 H(10)C(7)N(2)O(2) 0.0 Chemical derivative 1282 0.0 +UgiJoullieProGly@E 154.074228 154.1665 H(10)C(7)N(2)O(2) 0.0 Chemical derivative 1282 0.0 +Dipyridyl@C 225.090212 225.2459 H(11)C(13)N(3)O(1) 0.0 Chemical derivative 1277 0.0 +Furan@Y 66.010565 66.0581 H(2)C(4)O(1) 0.0 Chemical derivative 1278 0.0 +Difuran@Y 132.021129 132.1162 H(4)C(8)O(2) 0.0 Chemical derivative 1279 0.0 +BMP-piperidinol@C 263.131014 263.3337 H(17)C(18)N(1)O(1) 0.0 Chemical derivative 1281 0.0 +BMP-piperidinol@M 263.131014 263.3337 H(17)C(18)N(1)O(1) 0.0 Chemical derivative 1281 0.0 +UgiJoullieProGlyProGly@D 308.148455 308.333 H(20)C(14)N(4)O(4) 0.0 Chemical derivative 1283 0.0 +UgiJoullieProGlyProGly@E 308.148455 308.333 H(20)C(14)N(4)O(4) 0.0 Chemical derivative 1283 0.0 +Arg-loss@R^Any_C-term -156.101111 -156.1857 H(-12)C(-6)N(-4)O(-1) 0.0 Other 1287 0.0 +Arg@Any_N-term 156.101111 156.1857 H(12)C(6)N(4)O(1) 0.0 Other 1288 0.0 +IMEHex(2)NeuAc(1)@K 688.199683 688.6527 H(40)C(25)N(2)O(18)S(1) 0.0 Other glycosylation 1286 0.0 +Butyryl@K 70.041865 70.0898 H(6)C(4)O(1) 0.0 Post-translational 1289 CCCC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0 +Dicarbamidomethyl@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0 +Dicarbamidomethyl@H 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0 +Dicarbamidomethyl@C 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0 +Dicarbamidomethyl@R 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0 +Dicarbamidomethyl@Any_N-term 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Artefact 1290 0.0 +Dimethyl:2H(6)@K 34.068961 34.0901 H(-2)2H(6)C(2) 0.0 Isotopic label 1291 0.0 +Dimethyl:2H(6)@Any_N-term 34.068961 34.0901 H(-2)2H(6)C(2) 0.0 Isotopic label 1291 0.0 +Dimethyl:2H(6)@R 34.068961 34.0901 H(-2)2H(6)C(2) 0.0 Isotopic label 1291 0.0 +GGQ@K 242.101505 242.2319 H(14)C(9)N(4)O(4) 0.0 Other 1292 0.0 +QTGG@K 343.149184 343.3357 H(21)C(13)N(5)O(6) 0.0 Other 1293 0.0 +Label:13C(3)15N(1)@A 4.007099 3.9714 C(-3)13C(3)N(-1)15N(1) 0.0 Isotopic label 1297 0.0 +Label:13C(3)15N(1)@S 4.007099 3.9714 C(-3)13C(3)N(-1)15N(1) 0.0 Isotopic label 1297 0.0 +Label:13C(3)@A 3.010064 2.978 C(-3)13C(3) 0.0 Isotopic label 1296 0.0 +Label:13C(4)15N(1)@D 5.010454 4.964 C(-4)13C(4)N(-1)15N(1) 0.0 Isotopic label 1298 0.0 +Label:2H(10)@L 10.062767 10.0616 H(-10)2H(10) 0.0 Isotopic label 1299 0.0 +Label:2H(4)13C(1)@R 5.028462 5.0173 H(-4)2H(4)C(-1)13C(1) 0.0 Isotopic label 1300 0.0 +Lys@Any_N-term 128.094963 128.1723 H(12)C(6)N(2)O(1) 0.0 Other 1301 0.0 +mTRAQ:13C(6)15N(2)@K 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 [H]N(CCCC[C@H](N([Xe])([Xe]))C(=O)[Rn])C(=O)[13CH2][15N]1[13CH2][13CH2][15N]([13CH3])[13CH2][13CH2]1 0.0 +mTRAQ:13C(6)15N(2)@Any_N-term 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H]) 0.0 +mTRAQ:13C(6)15N(2)@Y 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0 +mTRAQ:13C(6)15N(2)@H 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0 +mTRAQ:13C(6)15N(2)@S 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0 +mTRAQ:13C(6)15N(2)@T 148.109162 148.1257 H(12)C(1)13C(6)15N(2)O(1) 0.0 Isotopic label 1302 0.0 +NeuAc@T 291.095417 291.2546 H(17)C(11)N(1)O(8) 291.095417 H(17)C(11)N(1)O(8) O-linked glycosylation 1303 0.5 +NeuAc@S 291.095417 291.2546 H(17)C(11)N(1)O(8) 291.095417 H(17)C(11)N(1)O(8) O-linked 
glycosylation 1303 0.5 +NeuAc@N 291.095417 291.2546 H(17)C(11)N(1)O(8) 291.095417 H(17)C(11)N(1)O(8) N-linked glycosylation 1303 0.5 +NeuGc@T 307.090331 307.254 H(17)C(11)N(1)O(9) 307.090331 H(17)C(11)N(1)O(9) O-linked glycosylation 1304 0.5 +NeuGc@S 307.090331 307.254 H(17)C(11)N(1)O(9) 307.090331 H(17)C(11)N(1)O(9) O-linked glycosylation 1304 0.5 +NeuGc@N 307.090331 307.254 H(17)C(11)N(1)O(9) 307.090331 H(17)C(11)N(1)O(9) N-linked glycosylation 1304 0.5 +Propyl@D 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0 +Propyl@K 42.04695 42.0797 H(6)C(3) 0.0 Isotopic label 1305 0.0 +Propyl@Any_N-term 42.04695 42.0797 H(6)C(3) 0.0 Isotopic label 1305 0.0 +Propyl@E 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 0.0 +Propyl@Any_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 OCCC 0.0 +Propyl@Protein_C-term 42.04695 42.0797 H(6)C(3) 0.0 Chemical derivative 1305 OCCC 0.0 +Propyl:2H(6)@Any_N-term 48.084611 48.1167 2H(6)C(3) 0.0 Isotopic label 1306 0.0 +Propyl:2H(6)@K 48.084611 48.1167 2H(6)C(3) 0.0 Isotopic label 1306 0.0 +Propiophenone@C 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@W 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@T 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@S 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@R 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@K 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +Propiophenone@H 132.057515 132.1592 H(8)C(9)O(1) 0.0 Chemical derivative 1310 0.0 +PS_Hapten@H 120.021129 120.1055 H(4)C(7)O(2) 0.0 Chemical derivative 1345 0.0 +PS_Hapten@C 120.021129 120.1055 H(4)C(7)O(2) 0.0 Chemical derivative 1345 0.0 +PS_Hapten@K 120.021129 120.1055 H(4)C(7)O(2) 0.0 Chemical derivative 1345 0.0 +Cy3-maleimide@C 753.262796 753.9046 H(45)C(37)N(4)O(9)S(2) 0.0 Chemical derivative 1348 0.0 +Delta:H(6)C(3)O(1)@Protein_N-term 58.041865 58.0791 H(6)C(3)O(1) 0.0 Chemical derivative 1312 0.0 +Delta:H(6)C(3)O(1)@K 58.041865 58.0791 H(6)C(3)O(1) 0.0 Chemical derivative 1312 0.0 +Delta:H(6)C(3)O(1)@H 58.041865 58.0791 H(6)C(3)O(1) 0.0 Chemical derivative 1312 0.0 +Delta:H(6)C(3)O(1)@C 58.041865 58.0791 H(6)C(3)O(1) 0.0 Chemical derivative 1312 0.0 +Delta:H(8)C(6)O(1)@Protein_N-term 96.057515 96.1271 H(8)C(6)O(1) 0.0 Chemical derivative 1313 0.0 +Delta:H(8)C(6)O(1)@K 96.057515 96.1271 H(8)C(6)O(1) 0.0 Chemical derivative 1313 0.0 +biotinAcrolein298@H 298.146347 298.4044 H(22)C(13)N(4)O(2)S(1) 0.0 Chemical derivative 1314 0.0 +biotinAcrolein298@K 298.146347 298.4044 H(22)C(13)N(4)O(2)S(1) 0.0 Chemical derivative 1314 0.0 +biotinAcrolein298@Protein_N-term 298.146347 298.4044 H(22)C(13)N(4)O(2)S(1) 0.0 Chemical derivative 1314 0.0 +biotinAcrolein298@C 298.146347 298.4044 H(22)C(13)N(4)O(2)S(1) 0.0 Chemical derivative 1314 0.0 +MM-diphenylpentanone@C 265.146664 265.3496 H(19)C(18)N(1)O(1) 0.0 Chemical derivative 1315 0.0 +EHD-diphenylpentanone@M 266.13068 266.3343 H(18)C(18)O(2) 0.0 Chemical derivative 1317 0.0 +EHD-diphenylpentanone@C 266.13068 266.3343 H(18)C(18)O(2) 0.0 Chemical derivative 1317 0.0 +benzylguanidine@K 132.068748 132.1625 H(8)C(8)N(2) 0.0 Chemical derivative 1349 0.0 +CarboxymethylDMAP@Any_N-term 162.079313 162.1885 H(10)C(9)N(2)O(1) 0.0 Chemical derivative 1350 0.0 +Biotin:Thermo-21901+2H2O@C 561.246849 561.6489 H(39)C(23)N(5)O(9)S(1) 0.0 Chemical derivative 1320 0.0 +DiLeu4plex115@K 145.12 145.1966 H(15)C(7)13C(1)15N(1)18O(1) 0.0 
Isotopic label 1321 0.0 +DiLeu4plex115@Any_N-term 145.12 145.1966 H(15)C(7)13C(1)15N(1)18O(1) 0.0 Isotopic label 1321 0.0 +DiLeu4plex115@Y 145.12 145.1966 H(15)C(7)13C(1)15N(1)18O(1) 0.0 Isotopic label 1321 0.0 +DiLeu4plex@Any_N-term 145.132163 145.2229 H(13)2H(2)C(8)N(1)18O(1) 0.0 Isotopic label 1322 0.0 +DiLeu4plex@K 145.132163 145.2229 H(13)2H(2)C(8)N(1)18O(1) 0.0 Isotopic label 1322 0.0 +DiLeu4plex@Y 145.132163 145.2229 H(13)2H(2)C(8)N(1)18O(1) 0.0 Isotopic label 1322 0.0 +DiLeu4plex117@K 145.128307 145.2092 H(13)2H(2)C(7)13C(1)15N(1)O(1) 0.0 Isotopic label 1323 0.0 +DiLeu4plex117@Any_N-term 145.128307 145.2092 H(13)2H(2)C(7)13C(1)15N(1)O(1) 0.0 Isotopic label 1323 0.0 +DiLeu4plex117@Y 145.128307 145.2092 H(13)2H(2)C(7)13C(1)15N(1)O(1) 0.0 Isotopic label 1323 0.0 +DiLeu4plex118@K 145.140471 145.2354 H(11)2H(4)C(8)N(1)O(1) 0.0 Isotopic label 1324 0.0 +DiLeu4plex118@Any_N-term 145.140471 145.2354 H(11)2H(4)C(8)N(1)O(1) 0.0 Isotopic label 1324 0.0 +DiLeu4plex118@Y 145.140471 145.2354 H(11)2H(4)C(8)N(1)O(1) 0.0 Isotopic label 1324 0.0 +Xlink:BuUrBu[213]@Protein_N-term 213.111341 213.2337 H(15)C(9)N(3)O(3) 0.0 Chemical derivative 1887 0.0 +Xlink:BuUrBu[213]@S 213.111341 213.2337 H(15)C(9)N(3)O(3) 0.0 Chemical derivative 1887 0.0 +Xlink:BuUrBu[213]@K 213.111341 213.2337 H(15)C(9)N(3)O(3) 0.0 Chemical derivative 1887 0.0 +Xlink:BuUrBu[213]@T 213.111341 213.2337 H(15)C(9)N(3)O(3) 0.0 Chemical derivative 1887 0.0 +Xlink:BuUrBu[213]@Y 213.111341 213.2337 H(15)C(9)N(3)O(3) 0.0 Chemical derivative 1887 0.0 +bisANS-sulfonates@T 434.178299 434.5305 H(22)C(32)N(2) 0.0 Chemical derivative 1330 0.0 +bisANS-sulfonates@S 434.178299 434.5305 H(22)C(32)N(2) 0.0 Chemical derivative 1330 0.0 +bisANS-sulfonates@K 434.178299 434.5305 H(22)C(32)N(2) 0.0 Chemical derivative 1330 0.0 +DNCB_hapten@Y 166.001457 166.0911 H(2)C(6)N(2)O(4) 0.0 Chemical derivative 1331 0.0 +DNCB_hapten@H 166.001457 166.0911 H(2)C(6)N(2)O(4) 0.0 Chemical derivative 1331 0.0 +DNCB_hapten@K 166.001457 166.0911 H(2)C(6)N(2)O(4) 0.0 Chemical derivative 1331 0.0 +DNCB_hapten@C 166.001457 166.0911 H(2)C(6)N(2)O(4) 0.0 Chemical derivative 1331 0.0 +NEMsulfur@C 157.019749 157.1903 H(7)C(6)N(1)O(2)S(1) 0.0 Chemical derivative 1326 0.0 +SulfurDioxide@C 63.9619 64.0638 O(2)S(1) 0.0 Post-translational 1327 0.0 +NEMsulfurWater@C 175.030314 175.2056 H(9)C(6)N(1)O(3)S(1) 0.0 Chemical derivative 1328 0.0 +HN3_mustard@C 131.094629 131.1729 H(13)C(6)N(1)O(2) 0.0 Post-translational 1389 0.0 +HN3_mustard@H 131.094629 131.1729 H(13)C(6)N(1)O(2) 0.0 Post-translational 1389 0.0 +HN3_mustard@K 131.094629 131.1729 H(13)C(6)N(1)O(2) 0.0 Post-translational 1389 0.0 +3-phosphoglyceryl@K 167.982375 168.042 H(5)C(3)O(6)P(1) 0.0 Post-translational 1387 0.0 +HN2_mustard@H 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388 0.0 +HN2_mustard@K 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388 0.0 +HN2_mustard@C 101.084064 101.1469 H(11)C(5)N(1)O(1) 0.0 Post-translational 1388 0.0 +NEM:2H(5)+H2O@C 148.089627 148.1714 H(4)2H(5)C(6)N(1)O(3) 0.0 Chemical derivative 1358 0.0 +Crotonyl@K 68.026215 68.074 H(4)C(4)O(1) 0.0 Post-translational 1363 CC=CC(=O)NCCCCC(C(=O)[Rn])N([Xe])([Xe]) 0.0 +O-Et-N-diMePhospho@S 135.044916 135.1015 H(10)C(4)N(1)O(2)P(1) 0.0 Chemical derivative 1364 0.0 +N-dimethylphosphate@S 107.013615 107.0483 H(6)C(2)N(1)O(2)P(1) 0.0 Chemical derivative 1365 0.0 +phosphoRibosyl@E 212.00859 212.0945 H(9)C(5)O(7)P(1) 0.0 Post-translational 1356 0.0 +phosphoRibosyl@R 212.00859 212.0945 H(9)C(5)O(7)P(1) 0.0 Post-translational 1356 0.0 
+phosphoRibosyl@D 212.00859 212.0945 H(9)C(5)O(7)P(1) 0.0 Post-translational 1356 0.0 +azole@C -20.026215 -20.0312 H(-4)O(-1) 0.0 Post-translational 1355 0.0 +azole@S -20.026215 -20.0312 H(-4)O(-1) 0.0 Post-translational 1355 0.0 +Biotin:Thermo-21911@C 921.461652 922.0913 H(71)C(41)N(5)O(16)S(1) 0.0 Chemical derivative 1340 0.0 +iodoTMT@K 324.216141 324.4185 H(28)C(16)N(4)O(3) 0.0 Chemical derivative 1341 0.0 +iodoTMT@H 324.216141 324.4185 H(28)C(16)N(4)O(3) 0.0 Chemical derivative 1341 0.0 +iodoTMT@E 324.216141 324.4185 H(28)C(16)N(4)O(3) 0.0 Chemical derivative 1341 0.0 +iodoTMT@D 324.216141 324.4185 H(28)C(16)N(4)O(3) 0.0 Chemical derivative 1341 0.0 +iodoTMT@C 324.216141 324.4185 H(28)C(16)N(4)O(3) 0.0 Chemical derivative 1341 0.0 +iodoTMT6plex@K 329.226595 329.3825 H(28)C(12)13C(4)N(3)15N(1)O(3) 0.0 Chemical derivative 1342 0.0 +iodoTMT6plex@H 329.226595 329.3825 H(28)C(12)13C(4)N(3)15N(1)O(3) 0.0 Chemical derivative 1342 0.0 +iodoTMT6plex@E 329.226595 329.3825 H(28)C(12)13C(4)N(3)15N(1)O(3) 0.0 Chemical derivative 1342 0.0 +iodoTMT6plex@D 329.226595 329.3825 H(28)C(12)13C(4)N(3)15N(1)O(3) 0.0 Chemical derivative 1342 0.0 +iodoTMT6plex@C 329.226595 329.3825 H(28)C(12)13C(4)N(3)15N(1)O(3) 0.0 Chemical derivative 1342 0.0 +Label:13C(2)15N(2)@K 4.00078 3.9721 C(-2)13C(2)N(-2)15N(2) 0.0 Isotopic label 1787 0.0 +Phosphogluconoylation@Any_N-term 258.014069 258.1199 H(11)C(6)O(9)P(1) 0.0 Post-translational 1344 0.0 +Phosphogluconoylation@K 258.014069 258.1199 H(11)C(6)O(9)P(1) 0.0 Post-translational 1344 0.0 +Methyl:2H(3)+Acetyl:2H(3)@K 62.063875 62.1002 H(-2)2H(6)C(3)O(1) 0.0 Isotopic label 1368 0.0 +dHex(1)Hex(1)@T 308.110732 308.2818 H(20)C(12)O(9) 308.110732 H(20)C(12)O(9) O-linked glycosylation 1367 0.5 +dHex(1)Hex(1)@S 308.110732 308.2818 H(20)C(12)O(9) 308.110732 H(20)C(12)O(9) O-linked glycosylation 1367 0.5 +methylsulfonylethyl@K 106.00885 106.1435 H(6)C(3)O(2)S(1) 0.0 Chemical derivative 1380 0.0 +methylsulfonylethyl@H 106.00885 106.1435 H(6)C(3)O(2)S(1) 0.0 Chemical derivative 1380 0.0 +methylsulfonylethyl@C 106.00885 106.1435 H(6)C(3)O(2)S(1) 0.0 Chemical derivative 1380 0.0 +Label:2H(3)+Oxidation@M 19.013745 19.0179 H(-3)2H(3)O(1) 0.0 Isotopic label 1370 0.0 +Trimethyl:2H(9)@R 51.103441 51.1352 H(-3)2H(9)C(3) 0.0 Isotopic label 1371 0.0 +Trimethyl:2H(9)@K 51.103441 51.1352 H(-3)2H(9)C(3) 0.0 Isotopic label 1371 0.0 +Acetyl:13C(2)@K 44.017274 44.022 H(2)13C(2)O(1) 0.0 Isotopic label 1372 0.0 +Acetyl:13C(2)@Protein_N-term 44.017274 44.022 H(2)13C(2)O(1) 0.0 Isotopic label 1372 0.0 +dHex(1)Hex(2)@T 470.163556 470.4224 H(30)C(18)O(14) 470.163556 H(30)C(18)O(14) O-linked glycosylation 1375 0.5 +dHex(1)Hex(2)@S 470.163556 470.4224 H(30)C(18)O(14) 470.163556 H(30)C(18)O(14) O-linked glycosylation 1375 0.5 +dHex(1)Hex(3)@T 632.216379 632.563 H(40)C(24)O(19) 632.216379 H(40)C(24)O(19) O-linked glycosylation 1376 0.5 +dHex(1)Hex(3)@S 632.216379 632.563 H(40)C(24)O(19) 632.216379 H(40)C(24)O(19) O-linked glycosylation 1376 0.5 +dHex(1)Hex(4)@T 794.269203 794.7036 H(50)C(30)O(24) 794.269203 H(50)C(30)O(24) O-linked glycosylation 1377 0.5 +dHex(1)Hex(4)@S 794.269203 794.7036 H(50)C(30)O(24) 794.269203 H(50)C(30)O(24) O-linked glycosylation 1377 0.5 +dHex(1)Hex(5)@T 956.322026 956.8442 H(60)C(36)O(29) 956.322026 H(60)C(36)O(29) O-linked glycosylation 1378 0.5 +dHex(1)Hex(5)@S 956.322026 956.8442 H(60)C(36)O(29) 956.322026 H(60)C(36)O(29) O-linked glycosylation 1378 0.5 +dHex(1)Hex(6)@T 1118.37485 1118.9848 H(70)C(42)O(34) 1118.37485 H(70)C(42)O(34) O-linked glycosylation 1379 0.5 
+dHex(1)Hex(6)@S 1118.37485 1118.9848 H(70)C(42)O(34) 1118.37485 H(70)C(42)O(34) O-linked glycosylation 1379 0.5 +ethylsulfonylethyl@H 120.0245 120.1701 H(8)C(4)O(2)S(1) 0.0 Chemical derivative 1381 0.0 +ethylsulfonylethyl@C 120.0245 120.1701 H(8)C(4)O(2)S(1) 0.0 Chemical derivative 1381 0.0 +ethylsulfonylethyl@K 120.0245 120.1701 H(8)C(4)O(2)S(1) 0.0 Chemical derivative 1381 0.0 +phenylsulfonylethyl@C 168.0245 168.2129 H(8)C(8)O(2)S(1) 0.0 Chemical derivative 1382 0.0 +PyridoxalPhosphateH2@K 231.02966 231.1425 H(10)C(8)N(1)O(5)P(1) 0.0 Chemical derivative 1383 0.0 +Homocysteic_acid@M 33.969094 33.9716 H(-2)C(-1)O(3) 0.0 Artefact 1384 0.0 +Hydroxamic_acid@E 15.010899 15.0146 H(1)N(1) 0.0 Artefact 1385 0.0 +Hydroxamic_acid@D 15.010899 15.0146 H(1)N(1) 0.0 Artefact 1385 0.0 +Oxidation+NEM@C 141.042593 141.1247 H(7)C(6)N(1)O(3) 0.0 Chemical derivative 1390 0.0 +NHS-fluorescein@K 471.131802 471.4581 H(21)C(27)N(1)O(7) 0.0 Chemical derivative 1391 0.0 +DiART6plex@Y 217.162932 217.2527 H(20)C(7)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 1392 0.0 +DiART6plex@Protein_N-term 217.162932 217.2527 H(20)C(7)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 1392 0.0 +DiART6plex@Any_N-term 217.162932 217.2527 H(20)C(7)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 1392 0.0 +DiART6plex@K 217.162932 217.2527 H(20)C(7)13C(4)N(1)15N(1)O(2) 0.0 Isotopic label 1392 0.0 +DiART6plex115@K 217.156612 217.2535 H(20)C(8)13C(3)15N(2)O(2) 0.0 Isotopic label 1393 0.0 +DiART6plex115@Any_N-term 217.156612 217.2535 H(20)C(8)13C(3)15N(2)O(2) 0.0 Isotopic label 1393 0.0 +DiART6plex115@Protein_N-term 217.156612 217.2535 H(20)C(8)13C(3)15N(2)O(2) 0.0 Isotopic label 1393 0.0 +DiART6plex115@Y 217.156612 217.2535 H(20)C(8)13C(3)15N(2)O(2) 0.0 Isotopic label 1393 0.0 +DiART6plex116/119@Y 217.168776 217.2797 H(18)2H(2)C(9)13C(2)N(1)15N(1)O(2) 0.0 Isotopic label 1394 0.0 +DiART6plex116/119@Protein_N-term 217.168776 217.2797 H(18)2H(2)C(9)13C(2)N(1)15N(1)O(2) 0.0 Isotopic label 1394 0.0 +DiART6plex116/119@K 217.168776 217.2797 H(18)2H(2)C(9)13C(2)N(1)15N(1)O(2) 0.0 Isotopic label 1394 0.0 +DiART6plex116/119@Any_N-term 217.168776 217.2797 H(18)2H(2)C(9)13C(2)N(1)15N(1)O(2) 0.0 Isotopic label 1394 0.0 +DiART6plex117@K 217.162456 217.2805 H(18)2H(2)C(10)13C(1)15N(2)O(2) 0.0 Isotopic label 1395 0.0 +DiART6plex117@Any_N-term 217.162456 217.2805 H(18)2H(2)C(10)13C(1)15N(2)O(2) 0.0 Isotopic label 1395 0.0 +DiART6plex117@Protein_N-term 217.162456 217.2805 H(18)2H(2)C(10)13C(1)15N(2)O(2) 0.0 Isotopic label 1395 0.0 +DiART6plex117@Y 217.162456 217.2805 H(18)2H(2)C(10)13C(1)15N(2)O(2) 0.0 Isotopic label 1395 0.0 +DiART6plex118@K 217.175096 217.279 H(18)2H(2)C(8)13C(3)N(2)O(2) 0.0 Isotopic label 1396 0.0 +DiART6plex118@Any_N-term 217.175096 217.279 H(18)2H(2)C(8)13C(3)N(2)O(2) 0.0 Isotopic label 1396 0.0 +DiART6plex118@Protein_N-term 217.175096 217.279 H(18)2H(2)C(8)13C(3)N(2)O(2) 0.0 Isotopic label 1396 0.0 +DiART6plex118@Y 217.175096 217.279 H(18)2H(2)C(8)13C(3)N(2)O(2) 0.0 Isotopic label 1396 0.0 +Iodoacetanilide@K 133.052764 133.1473 H(7)C(8)N(1)O(1) 0.0 Artefact 1397 0.0 +Iodoacetanilide@C 133.052764 133.1473 H(7)C(8)N(1)O(1) 0.0 Chemical derivative 1397 0.0 +Iodoacetanilide@Any_N-term 133.052764 133.1473 H(7)C(8)N(1)O(1) 0.0 Artefact 1397 0.0 +Iodoacetanilide:13C(6)@K 139.072893 139.1032 H(7)C(2)13C(6)N(1)O(1) 0.0 Artefact 1398 0.0 +Iodoacetanilide:13C(6)@C 139.072893 139.1032 H(7)C(2)13C(6)N(1)O(1) 0.0 Chemical derivative 1398 0.0 +Iodoacetanilide:13C(6)@Any_N-term 139.072893 139.1032 H(7)C(2)13C(6)N(1)O(1) 0.0 Artefact 1398 0.0 +Dap-DSP@K 364.076278 
364.4377 H(20)C(13)N(2)O(6)S(2) 0.0 Chemical derivative 1399 0.0 +Dap-DSP@E 364.076278 364.4377 H(20)C(13)N(2)O(6)S(2) 0.0 Non-standard residue 1399 0.0 +Dap-DSP@A 364.076278 364.4377 H(20)C(13)N(2)O(6)S(2) 0.0 Non-standard residue 1399 0.0 +MurNAc@A 275.100502 275.2552 H(17)C(11)N(1)O(7) 0.0 Other glycosylation 1400 0.0 +EEEDVIEVYQEQTGG@K 1705.73189 1706.7153 H(107)C(72)N(17)O(31) 0.0 Chemical derivative 1405 0.0 +Label:2H(7)15N(4)@R 11.032077 11.0168 H(-7)2H(7)N(-4)15N(4) 0.0 Isotopic label 1402 0.0 +Label:2H(6)15N(1)@P 7.034695 7.0304 H(-6)2H(6)N(-1)15N(1) 0.0 Isotopic label 1403 0.0 +EDEDTIDVFQQQTGG@K 1662.700924 1663.6508 H(102)C(69)N(18)O(30) 0.0 Chemical derivative 1406 0.0 +Hex(5)HexNAc(4)NeuAc(2)@N 2204.772441 2205.9822 H(136)C(84)N(6)O(61) 2204.772441 H(136)C(84)N(6)O(61) N-linked glycosylation 1408 0.5 +Hex(5)HexNAc(4)NeuAc(1)@N 1913.677025 1914.7277 H(119)C(73)N(5)O(53) 1913.677025 H(119)C(73)N(5)O(53) N-linked glycosylation 1409 0.5 +dHex(1)Hex(5)HexNAc(4)NeuAc(1)@N 2059.734933 2060.8689 H(129)C(79)N(5)O(57) 2059.734933 H(129)C(79)N(5)O(57) N-linked glycosylation 1410 0.5 +dHex(1)Hex(5)HexNAc(4)NeuAc(2)@N 2350.83035 2352.1234 H(146)C(90)N(6)O(65) 2350.83035 H(146)C(90)N(6)O(65) N-linked glycosylation 1411 0.5 +s-GlcNAc@T 283.036187 283.2557 H(13)C(8)N(1)O(8)S(1) 283.036187 H(13)C(8)N(1)O(8)S(1) O-linked glycosylation 1412 0.5 +s-GlcNAc@S 283.036187 283.2557 H(13)C(8)N(1)O(8)S(1) 283.036187 H(13)C(8)N(1)O(8)S(1) O-linked glycosylation 1412 0.5 +PhosphoHex(2)@N 404.071978 404.2611 H(21)C(12)O(13)P(1) 404.071978 H(21)C(12)O(13)P(1) N-linked glycosylation 1413 0.5 +PhosphoHex(2)@T 404.071978 404.2611 H(21)C(12)O(13)P(1) 404.071978 H(21)C(12)O(13)P(1) O-linked glycosylation 1413 0.5 +PhosphoHex(2)@S 404.071978 404.2611 H(21)C(12)O(13)P(1) 404.071978 H(21)C(12)O(13)P(1) O-linked glycosylation 1413 0.5 +Trimethyl:13C(3)2H(9)@K 54.113505 54.1132 H(-3)2H(9)13C(3) 0.0 Isotopic label 1414 0.0 +Trimethyl:13C(3)2H(9)@R 54.113505 54.1132 H(-3)2H(9)13C(3) 0.0 Isotopic label 1414 0.0 +15N-oxobutanoic@S^Protein_N-term -18.023584 -18.0239 H(-3)15N(-1) 0.0 Post-translational 1419 0.0 +15N-oxobutanoic@C^Any_N-term -18.023584 -18.0239 H(-3)15N(-1) 0.0 Artefact 1419 0.0 +15N-oxobutanoic@T^Protein_N-term -18.023584 -18.0239 H(-3)15N(-1) 0.0 Post-translational 1419 0.0 +spermidine@Q 128.131349 128.2153 H(16)C(7)N(2) 0.0 Chemical derivative 1421 0.0 +Biotin:Thermo-21330@Any_N-term 473.219571 473.5835 H(35)C(21)N(3)O(7)S(1) 0.0 Chemical derivative 1423 0.0 +Biotin:Thermo-21330@K 473.219571 473.5835 H(35)C(21)N(3)O(7)S(1) 0.0 Chemical derivative 1423 0.0 +Hex(1)Pent(2)@T 426.137341 426.3698 H(26)C(16)O(13) 426.137341 H(26)C(16)O(13) O-linked glycosylation 1428 0.5 +Hex(1)Pent(2)@S 426.137341 426.3698 H(26)C(16)O(13) 426.137341 H(26)C(16)O(13) O-linked glycosylation 1428 0.5 +Pentose@T 132.042259 132.1146 H(8)C(5)O(4) 132.042259 H(8)C(5)O(4) O-linked glycosylation 1425 0.5 +Pentose@S 132.042259 132.1146 H(8)C(5)O(4) 132.042259 H(8)C(5)O(4) O-linked glycosylation 1425 0.5 +Hex(1)Pent(1)@T 294.095082 294.2552 H(18)C(11)O(9) 294.095082 H(18)C(11)O(9) O-linked glycosylation 1426 0.5 +Hex(1)Pent(1)@S 294.095082 294.2552 H(18)C(11)O(9) 294.095082 H(18)C(11)O(9) O-linked glycosylation 1426 0.5 +Hex(1)HexA(1)@T 338.084912 338.2647 H(18)C(12)O(11) 338.084912 H(18)C(12)O(11) O-linked glycosylation 1427 0.5 +Hex(1)HexA(1)@S 338.084912 338.2647 H(18)C(12)O(11) 338.084912 H(18)C(12)O(11) O-linked glycosylation 1427 0.5 +Hex(1)HexNAc(1)Phos(1)@T 445.098527 445.313 H(24)C(14)N(1)O(13)P(1) 445.098527 
H(24)C(14)N(1)O(13)P(1) O-linked glycosylation 1429 0.5 +Hex(1)HexNAc(1)Phos(1)@S 445.098527 445.313 H(24)C(14)N(1)O(13)P(1) 445.098527 H(24)C(14)N(1)O(13)P(1) O-linked glycosylation 1429 0.5 +Hex(1)HexNAc(1)Sulf(1)@T 445.089011 445.3963 H(23)C(14)N(1)O(13)S(1) 445.089011 H(23)C(14)N(1)O(13)S(1) O-linked glycosylation 1430 0.5 +Hex(1)HexNAc(1)Sulf(1)@S 445.089011 445.3963 H(23)C(14)N(1)O(13)S(1) 445.089011 H(23)C(14)N(1)O(13)S(1) O-linked glycosylation 1430 0.5 +Hex(1)NeuAc(1)@T 453.14824 453.3952 H(27)C(17)N(1)O(13) 453.14824 H(27)C(17)N(1)O(13) O-linked glycosylation 1431 0.5 +Hex(1)NeuAc(1)@S 453.14824 453.3952 H(27)C(17)N(1)O(13) 453.14824 H(27)C(17)N(1)O(13) O-linked glycosylation 1431 0.5 +Hex(1)NeuGc(1)@T 469.143155 469.3946 H(27)C(17)N(1)O(14) 469.143155 H(27)C(17)N(1)O(14) O-linked glycosylation 1432 0.5 +Hex(1)NeuGc(1)@S 469.143155 469.3946 H(27)C(17)N(1)O(14) 469.143155 H(27)C(17)N(1)O(14) O-linked glycosylation 1432 0.5 +HexNAc(3)@T 609.238118 609.5776 H(39)C(24)N(3)O(15) 609.238118 H(39)C(24)N(3)O(15) O-linked glycosylation 1433 0.5 +HexNAc(3)@S 609.238118 609.5776 H(39)C(24)N(3)O(15) 609.238118 H(39)C(24)N(3)O(15) O-linked glycosylation 1433 0.5 +HexNAc(1)NeuAc(1)@T 494.174789 494.4471 H(30)C(19)N(2)O(13) 494.174789 H(30)C(19)N(2)O(13) O-linked glycosylation 1434 0.5 +HexNAc(1)NeuAc(1)@S 494.174789 494.4471 H(30)C(19)N(2)O(13) 494.174789 H(30)C(19)N(2)O(13) O-linked glycosylation 1434 0.5 +HexNAc(1)NeuGc(1)@T 510.169704 510.4465 H(30)C(19)N(2)O(14) 510.169704 H(30)C(19)N(2)O(14) O-linked glycosylation 1435 0.5 +HexNAc(1)NeuGc(1)@S 510.169704 510.4465 H(30)C(19)N(2)O(14) 510.169704 H(30)C(19)N(2)O(14) O-linked glycosylation 1435 0.5 +Hex(2)NeuAc(1)@T 615.201064 615.5358 H(37)C(23)N(1)O(18) 615.201064 H(37)C(23)N(1)O(18) O-linked glycosylation 1444 0.5 +Hex(2)NeuAc(1)@S 615.201064 615.5358 H(37)C(23)N(1)O(18) 615.201064 H(37)C(23)N(1)O(18) O-linked glycosylation 1444 0.5 +Hex(1)HexNAc(1)dHex(1)Me(1)@T 525.205755 525.5009 H(35)C(21)N(1)O(14) 525.205755 H(35)C(21)N(1)O(14) O-linked glycosylation 1436 0.5 +Hex(1)HexNAc(1)dHex(1)Me(1)@S 525.205755 525.5009 H(35)C(21)N(1)O(14) 525.205755 H(35)C(21)N(1)O(14) O-linked glycosylation 1436 0.5 +Hex(1)HexNAc(1)dHex(1)Me(2)@T 539.221405 539.5275 H(37)C(22)N(1)O(14) 539.221405 H(37)C(22)N(1)O(14) O-linked glycosylation 1437 0.5 +Hex(1)HexNAc(1)dHex(1)Me(2)@S 539.221405 539.5275 H(37)C(22)N(1)O(14) 539.221405 H(37)C(22)N(1)O(14) O-linked glycosylation 1437 0.5 +Xlink:DSS[155]@Protein_N-term 155.094629 155.1943 H(13)C(8)N(1)O(2) 0.0 Chemical derivative 1789 0.0 +Xlink:DSS[155]@K 155.094629 155.1943 H(13)C(8)N(1)O(2) 0.0 Chemical derivative 1789 0.0 +Hex(2)HexNAc(1)@N 527.18502 527.4737 H(33)C(20)N(1)O(15) 527.18502 H(33)C(20)N(1)O(15) N-linked glycosylation 1438 0.5 +Hex(2)HexNAc(1)@T 527.18502 527.4737 H(33)C(20)N(1)O(15) 527.18502 H(33)C(20)N(1)O(15) O-linked glycosylation 1438 0.5 +Hex(2)HexNAc(1)@S 527.18502 527.4737 H(33)C(20)N(1)O(15) 527.18502 H(33)C(20)N(1)O(15) O-linked glycosylation 1438 0.5 +Hex(1)HexA(1)HexNAc(1)@T 541.164284 541.4572 H(31)C(20)N(1)O(16) 541.164284 H(31)C(20)N(1)O(16) O-linked glycosylation 1439 0.5 +Hex(1)HexA(1)HexNAc(1)@S 541.164284 541.4572 H(31)C(20)N(1)O(16) 541.164284 H(31)C(20)N(1)O(16) O-linked glycosylation 1439 0.5 +Hex(2)HexNAc(1)Me(1)@T 541.20067 541.5003 H(35)C(21)N(1)O(15) 541.20067 H(35)C(21)N(1)O(15) O-linked glycosylation 1440 0.5 +Hex(2)HexNAc(1)Me(1)@S 541.20067 541.5003 H(35)C(21)N(1)O(15) 541.20067 H(35)C(21)N(1)O(15) O-linked glycosylation 1440 0.5 +Hex(1)Pent(3)@T 558.1796 558.4845 
H(34)C(21)O(17) 558.1796 H(34)C(21)O(17) O-linked glycosylation 1441 0.5 +Hex(1)Pent(3)@S 558.1796 558.4845 H(34)C(21)O(17) 558.1796 H(34)C(21)O(17) O-linked glycosylation 1441 0.5 +Hex(1)NeuAc(1)Pent(1)@S 585.190499 585.5098 H(35)C(22)N(1)O(17) 585.190499 H(35)C(22)N(1)O(17) O-linked glycosylation 1442 0.5 +Hex(1)NeuAc(1)Pent(1)@T 585.190499 585.5098 H(35)C(22)N(1)O(17) 585.190499 H(35)C(22)N(1)O(17) O-linked glycosylation 1442 0.5 +Hex(2)HexNAc(1)Sulf(1)@T 607.141834 607.5369 H(33)C(20)N(1)O(18)S(1) 607.141834 H(33)C(20)N(1)O(18)S(1) O-linked glycosylation 1443 0.5 +Hex(2)HexNAc(1)Sulf(1)@S 607.141834 607.5369 H(33)C(20)N(1)O(18)S(1) 607.141834 H(33)C(20)N(1)O(18)S(1) O-linked glycosylation 1443 0.5 +dHex(2)Hex(2)@S 616.221465 616.5636 H(40)C(24)O(18) 616.221465 H(40)C(24)O(18) O-linked glycosylation 1445 0.5 +dHex(2)Hex(2)@T 616.221465 616.5636 H(40)C(24)O(18) 616.221465 H(40)C(24)O(18) O-linked glycosylation 1445 0.5 +dHex(1)Hex(2)HexA(1)@S 646.195644 646.5465 H(38)C(24)O(20) 646.195644 H(38)C(24)O(20) O-linked glycosylation 1446 0.5 +dHex(1)Hex(2)HexA(1)@T 646.195644 646.5465 H(38)C(24)O(20) 646.195644 H(38)C(24)O(20) O-linked glycosylation 1446 0.5 +Hex(1)HexNAc(2)Sulf(1)@T 648.168383 648.5888 H(36)C(22)N(2)O(18)S(1) 648.168383 H(36)C(22)N(2)O(18)S(1) O-linked glycosylation 1447 0.5 +Hex(1)HexNAc(2)Sulf(1)@S 648.168383 648.5888 H(36)C(22)N(2)O(18)S(1) 648.168383 H(36)C(22)N(2)O(18)S(1) O-linked glycosylation 1447 0.5 +Hex(4)@S 648.211294 648.5624 H(40)C(24)O(20) 648.211294 H(40)C(24)O(20) O-linked glycosylation 1448 0.5 +Hex(4)@T 648.211294 648.5624 H(40)C(24)O(20) 648.211294 H(40)C(24)O(20) O-linked glycosylation 1448 0.5 +dHex(1)Hex(2)HexNAc(2)Pent(1)@N 1008.36456 1008.9221 H(64)C(39)N(2)O(28) 1008.36456 H(64)C(39)N(2)O(28) N-linked glycosylation 1449 0.5 +Hex(2)HexNAc(2)NeuAc(1)@N 1021.359809 1021.9208 H(63)C(39)N(3)O(28) 1021.359809 H(63)C(39)N(3)O(28) N-linked glycosylation 1450 0.5 +Hex(2)HexNAc(2)NeuAc(1)@S 1021.359809 1021.9208 H(63)C(39)N(3)O(28) 1021.359809 H(63)C(39)N(3)O(28) O-linked glycosylation 1450 0.5 +Hex(2)HexNAc(2)NeuAc(1)@T 1021.359809 1021.9208 H(63)C(39)N(3)O(28) 1021.359809 H(63)C(39)N(3)O(28) O-linked glycosylation 1450 0.5 +Hex(3)HexNAc(2)Pent(1)@N 1024.359475 1024.9215 H(64)C(39)N(2)O(29) 1024.359475 H(64)C(39)N(2)O(29) N-linked glycosylation 1451 0.5 +Hex(4)HexNAc(2)@N 1054.370039 1054.9474 H(66)C(40)N(2)O(30) 1054.370039 H(66)C(40)N(2)O(30) N-linked glycosylation 1452 0.5 +dHex(1)Hex(4)HexNAc(1)Pent(1)@N 1129.390834 1130.0107 H(71)C(43)N(1)O(33) 1129.390834 H(71)C(43)N(1)O(33) N-linked glycosylation 1453 0.5 +dHex(1)Hex(3)HexNAc(2)Pent(1)@N 1170.417383 1171.0627 H(74)C(45)N(2)O(33) 1170.417383 H(74)C(45)N(2)O(33) N-linked glycosylation 1454 0.5 +Hex(3)HexNAc(2)NeuAc(1)@N 1183.412632 1184.0614 H(73)C(45)N(3)O(33) 1183.412632 H(73)C(45)N(3)O(33) N-linked glycosylation 1455 0.5 +Hex(4)HexNAc(2)Pent(1)@N 1186.412298 1187.0621 H(74)C(45)N(2)O(34) 1186.412298 H(74)C(45)N(2)O(34) N-linked glycosylation 1456 0.5 +Hex(3)HexNAc(3)Pent(1)@N 1227.438847 1228.114 H(77)C(47)N(3)O(34) 1227.438847 H(77)C(47)N(3)O(34) N-linked glycosylation 1457 0.5 +Hex(5)HexNAc(2)Phos(1)@N 1296.389194 1297.0679 H(77)C(46)N(2)O(38)P(1) 1296.389194 H(77)C(46)N(2)O(38)P(1) N-linked glycosylation 1458 0.5 +dHex(1)Hex(4)HexNAc(2)Pent(1)@N 1332.470207 1333.2033 H(84)C(51)N(2)O(38) 1332.470207 H(84)C(51)N(2)O(38) N-linked glycosylation 1459 0.5 +Hex(7)HexNAc(1)@N 1337.449137 1338.1767 H(83)C(50)N(1)O(40) 1337.449137 H(83)C(50)N(1)O(40) N-linked glycosylation 1460 0.5 +Hex(4)HexNAc(2)NeuAc(1)@N 
1345.465456 1346.202 H(83)C(51)N(3)O(38) 1345.465456 H(83)C(51)N(3)O(38) N-linked glycosylation 1461 0.5 +Hex(4)HexNAc(2)NeuAc(1)@S 1345.465456 1346.202 H(83)C(51)N(3)O(38) 1345.465456 H(83)C(51)N(3)O(38) O-linked glycosylation 1461 0.5 +Hex(4)HexNAc(2)NeuAc(1)@T 1345.465456 1346.202 H(83)C(51)N(3)O(38) 1345.465456 H(83)C(51)N(3)O(38) O-linked glycosylation 1461 0.5 +dHex(1)Hex(5)HexNAc(2)@N 1362.480772 1363.2292 H(86)C(52)N(2)O(39) 1362.480772 H(86)C(52)N(2)O(39) N-linked glycosylation 1462 0.5 +dHex(1)Hex(3)HexNAc(3)Pent(1)@N 1373.496756 1374.2552 H(87)C(53)N(3)O(38) 1373.496756 H(87)C(53)N(3)O(38) N-linked glycosylation 1463 0.5 +Hex(3)HexNAc(4)Sulf(1)@N 1378.432776 1379.2551 H(82)C(50)N(4)O(38)S(1) 1378.432776 H(82)C(50)N(4)O(38)S(1) N-linked glycosylation 1464 0.5 +Hex(6)HexNAc(2)@N 1378.475686 1379.2286 H(86)C(52)N(2)O(40) 1378.475686 H(86)C(52)N(2)O(40) N-linked glycosylation 1465 0.5 +Hex(4)HexNAc(3)Pent(1)@N 1389.491671 1390.2546 H(87)C(53)N(3)O(39) 1389.491671 H(87)C(53)N(3)O(39) N-linked glycosylation 1466 0.5 +dHex(1)Hex(4)HexNAc(3)@N 1403.507321 1404.2812 H(89)C(54)N(3)O(39) 1403.507321 H(89)C(54)N(3)O(39) N-linked glycosylation 1467 0.5 +Hex(5)HexNAc(3)@N 1419.502235 1420.2806 H(89)C(54)N(3)O(40) 1419.502235 H(89)C(54)N(3)O(40) N-linked glycosylation 1468 0.5 +Hex(3)HexNAc(4)Pent(1)@N 1430.51822 1431.3065 H(90)C(55)N(4)O(39) 1430.51822 H(90)C(55)N(4)O(39) N-linked glycosylation 1469 0.5 +Hex(6)HexNAc(2)Phos(1)@N 1458.442017 1459.2085 H(87)C(52)N(2)O(43)P(1) 1458.442017 H(87)C(52)N(2)O(43)P(1) N-linked glycosylation 1470 0.5 +dHex(1)Hex(4)HexNAc(3)Sulf(1)@N 1483.464135 1484.3444 H(89)C(54)N(3)O(42)S(1) 1483.464135 H(89)C(54)N(3)O(42)S(1) N-linked glycosylation 1471 0.5 +dHex(1)Hex(5)HexNAc(2)Pent(1)@N 1494.52303 1495.3439 H(94)C(57)N(2)O(43) 1494.52303 H(94)C(57)N(2)O(43) N-linked glycosylation 1472 0.5 +Hex(8)HexNAc(1)@N 1499.501961 1500.3173 H(93)C(56)N(1)O(45) 1499.501961 H(93)C(56)N(1)O(45) N-linked glycosylation 1473 0.5 +dHex(1)Hex(3)HexNAc(3)Pent(2)@N 1505.539015 1506.3698 H(95)C(58)N(3)O(42) 1505.539015 H(95)C(58)N(3)O(42) N-linked glycosylation 1474 0.5 +dHex(2)Hex(3)HexNAc(3)Pent(1)@N 1519.554665 1520.3964 H(97)C(59)N(3)O(42) 1519.554665 H(97)C(59)N(3)O(42) N-linked glycosylation 1475 0.5 +dHex(1)Hex(3)HexNAc(4)Sulf(1)@N 1524.490684 1525.3963 H(92)C(56)N(4)O(42)S(1) 1524.490684 H(92)C(56)N(4)O(42)S(1) N-linked glycosylation 1476 0.5 +dHex(1)Hex(6)HexNAc(2)@N 1524.533595 1525.3698 H(96)C(58)N(2)O(44) 1524.533595 H(96)C(58)N(2)O(44) N-linked glycosylation 1477 0.5 +dHex(1)Hex(4)HexNAc(3)Pent(1)@N 1535.549579 1536.3958 H(97)C(59)N(3)O(43) 1535.549579 H(97)C(59)N(3)O(43) N-linked glycosylation 1478 0.5 +Hex(4)HexNAc(4)Sulf(1)@N 1540.485599 1541.3957 H(92)C(56)N(4)O(43)S(1) 1540.485599 H(92)C(56)N(4)O(43)S(1) N-linked glycosylation 1479 0.5 +Hex(7)HexNAc(2)@N 1540.52851 1541.3692 H(96)C(58)N(2)O(45) 1540.52851 H(96)C(58)N(2)O(45) N-linked glycosylation 1480 0.5 +dHex(2)Hex(4)HexNAc(3)@N 1549.56523 1550.4224 H(99)C(60)N(3)O(43) 1549.56523 H(99)C(60)N(3)O(43) N-linked glycosylation 1481 0.5 +Hex(5)HexNAc(3)Pent(1)@N 1551.544494 1552.3952 H(97)C(59)N(3)O(44) 1551.544494 H(97)C(59)N(3)O(44) N-linked glycosylation 1482 0.5 +Hex(4)HexNAc(3)NeuGc(1)@N 1564.539743 1565.3939 H(96)C(59)N(4)O(44) 1564.539743 H(96)C(59)N(4)O(44) N-linked glycosylation 1483 0.5 +dHex(1)Hex(5)HexNAc(3)@N 1565.560144 1566.4218 H(99)C(60)N(3)O(44) 1565.560144 H(99)C(60)N(3)O(44) N-linked glycosylation 1484 0.5 +dHex(1)Hex(3)HexNAc(4)Pent(1)@N 1576.576129 1577.4477 H(100)C(61)N(4)O(43) 1576.576129 
H(100)C(61)N(4)O(43) N-linked glycosylation 1485 0.5 +Hex(3)HexNAc(5)Sulf(1)@N 1581.512148 1582.4476 H(95)C(58)N(5)O(43)S(1) 1581.512148 H(95)C(58)N(5)O(43)S(1) N-linked glycosylation 1486 0.5 +Hex(6)HexNAc(3)@N 1581.555059 1582.4212 H(99)C(60)N(3)O(45) 1581.555059 H(99)C(60)N(3)O(45) N-linked glycosylation 1487 0.5 +Hex(3)HexNAc(4)NeuAc(1)@N 1589.571378 1590.4465 H(99)C(61)N(5)O(43) 1589.571378 H(99)C(61)N(5)O(43) N-linked glycosylation 1488 0.5 +Hex(4)HexNAc(4)Pent(1)@N 1592.571043 1593.4471 H(100)C(61)N(4)O(44) 1592.571043 H(100)C(61)N(4)O(44) N-linked glycosylation 1489 0.5 +Hex(7)HexNAc(2)Phos(1)@N 1620.494841 1621.3491 H(97)C(58)N(2)O(48)P(1) 1620.494841 H(97)C(58)N(2)O(48)P(1) N-linked glycosylation 1490 0.5 +Hex(4)HexNAc(4)Me(2)Pent(1)@N 1620.602343 1621.5003 H(104)C(63)N(4)O(44) 1620.602343 H(104)C(63)N(4)O(44) N-linked glycosylation 1491 0.5 +dHex(1)Hex(3)HexNAc(3)Pent(3)@N 1637.581274 1638.4844 H(103)C(63)N(3)O(46) 1637.581274 H(103)C(63)N(3)O(46) N-linked glycosylation 1492 0.5 +dHex(1)Hex(5)HexNAc(3)Sulf(1)@N 1645.516959 1646.485 H(99)C(60)N(3)O(47)S(1) 1645.516959 H(99)C(60)N(3)O(47)S(1) N-linked glycosylation 1493 0.5 +dHex(2)Hex(3)HexNAc(3)Pent(2)@N 1651.596924 1652.511 H(105)C(64)N(3)O(46) 1651.596924 H(105)C(64)N(3)O(46) N-linked glycosylation 1494 0.5 +Hex(6)HexNAc(3)Phos(1)@N 1661.52139 1662.4011 H(100)C(60)N(3)O(48)P(1) 1661.52139 H(100)C(60)N(3)O(48)P(1) N-linked glycosylation 1495 0.5 +Hex(4)HexNAc(5)@N 1663.608157 1664.525 H(105)C(64)N(5)O(45) 1663.608157 H(105)C(64)N(5)O(45) N-linked glycosylation 1496 0.5 +dHex(3)Hex(3)HexNAc(3)Pent(1)@N 1665.612574 1666.5376 H(107)C(65)N(3)O(46) 1665.612574 H(107)C(65)N(3)O(46) N-linked glycosylation 1497 0.5 +dHex(2)Hex(4)HexNAc(3)Pent(1)@N 1681.607488 1682.537 H(107)C(65)N(3)O(47) 1681.607488 H(107)C(65)N(3)O(47) N-linked glycosylation 1498 0.5 +dHex(1)Hex(4)HexNAc(4)Sulf(1)@N 1686.543508 1687.5369 H(102)C(62)N(4)O(47)S(1) 1686.543508 H(102)C(62)N(4)O(47)S(1) N-linked glycosylation 1499 0.5 +dHex(1)Hex(7)HexNAc(2)@N 1686.586419 1687.5104 H(106)C(64)N(2)O(49) 1686.586419 H(106)C(64)N(2)O(49) N-linked glycosylation 1500 0.5 +dHex(1)Hex(4)HexNAc(3)NeuAc(1)@N 1694.602737 1695.5357 H(106)C(65)N(4)O(47) 1694.602737 H(106)C(65)N(4)O(47) N-linked glycosylation 1501 0.5 +dHex(1)Hex(4)HexNAc(3)NeuAc(1)@S 1694.602737 1695.5357 H(106)C(65)N(4)O(47) 1694.602737 H(106)C(65)N(4)O(47) O-linked glycosylation 1501 0.5 +dHex(1)Hex(4)HexNAc(3)NeuAc(1)@T 1694.602737 1695.5357 H(106)C(65)N(4)O(47) 1694.602737 H(106)C(65)N(4)O(47) O-linked glycosylation 1501 0.5 +Hex(7)HexNAc(2)Phos(2)@N 1700.461172 1701.329 H(98)C(58)N(2)O(51)P(2) 1700.461172 H(98)C(58)N(2)O(51)P(2) N-linked glycosylation 1502 0.5 +Hex(5)HexNAc(4)Sulf(1)@N 1702.538423 1703.5363 H(102)C(62)N(4)O(48)S(1) 1702.538423 H(102)C(62)N(4)O(48)S(1) N-linked glycosylation 1503 0.5 +Hex(8)HexNAc(2)@N 1702.581333 1703.5098 H(106)C(64)N(2)O(50) 1702.581333 H(106)C(64)N(2)O(50) N-linked glycosylation 1504 0.5 +dHex(1)Hex(3)HexNAc(4)Pent(2)@N 1708.618387 1709.5623 H(108)C(66)N(4)O(47) 1708.618387 H(108)C(66)N(4)O(47) N-linked glycosylation 1505 0.5 +dHex(1)Hex(4)HexNAc(3)NeuGc(1)@N 1710.597652 1711.5351 H(106)C(65)N(4)O(48) 1710.597652 H(106)C(65)N(4)O(48) N-linked glycosylation 1506 0.5 +dHex(2)Hex(3)HexNAc(4)Pent(1)@N 1722.634037 1723.5889 H(110)C(67)N(4)O(47) 1722.634037 H(110)C(67)N(4)O(47) N-linked glycosylation 1507 0.5 +dHex(1)Hex(3)HexNAc(5)Sulf(1)@N 1727.570057 1728.5888 H(105)C(64)N(5)O(47)S(1) 1727.570057 H(105)C(64)N(5)O(47)S(1) N-linked glycosylation 1508 0.5 +dHex(1)Hex(6)HexNAc(3)@N 
1727.612968 1728.5624 H(109)C(66)N(3)O(49) 1727.612968 H(109)C(66)N(3)O(49) N-linked glycosylation 1509 0.5 +dHex(1)Hex(3)HexNAc(4)NeuAc(1)@N 1735.629286 1736.5877 H(109)C(67)N(5)O(47) 1735.629286 H(109)C(67)N(5)O(47) N-linked glycosylation 1510 0.5 +dHex(3)Hex(3)HexNAc(4)@N 1736.649688 1737.6155 H(112)C(68)N(4)O(47) 1736.649688 H(112)C(68)N(4)O(47) N-linked glycosylation 1511 0.5 +dHex(1)Hex(4)HexNAc(4)Pent(1)@N 1738.628952 1739.5883 H(110)C(67)N(4)O(48) 1738.628952 H(110)C(67)N(4)O(48) N-linked glycosylation 1512 0.5 +Hex(4)HexNAc(5)Sulf(1)@N 1743.564972 1744.5882 H(105)C(64)N(5)O(48)S(1) 1743.564972 H(105)C(64)N(5)O(48)S(1) N-linked glycosylation 1513 0.5 +Hex(7)HexNAc(3)@N 1743.607882 1744.5618 H(109)C(66)N(3)O(50) 1743.607882 H(109)C(66)N(3)O(50) N-linked glycosylation 1514 0.5 +dHex(1)Hex(4)HexNAc(3)NeuAc(1)Sulf(1)@N 1774.559552 1775.5989 H(106)C(65)N(4)O(50)S(1) 1774.559552 H(106)C(65)N(4)O(50)S(1) N-linked glycosylation 1515 0.5 +Hex(5)HexNAc(4)Me(2)Pent(1)@N 1782.655167 1783.6409 H(114)C(69)N(4)O(49) 1782.655167 H(114)C(69)N(4)O(49) N-linked glycosylation 1516 0.5 +Hex(3)HexNAc(6)Sulf(1)@N 1784.591521 1785.6401 H(108)C(66)N(6)O(48)S(1) 1784.591521 H(108)C(66)N(6)O(48)S(1) N-linked glycosylation 1517 0.5 +dHex(1)Hex(6)HexNAc(3)Sulf(1)@N 1807.569782 1808.6256 H(109)C(66)N(3)O(52)S(1) 1807.569782 H(109)C(66)N(3)O(52)S(1) N-linked glycosylation 1518 0.5 +dHex(1)Hex(4)HexNAc(5)@N 1809.666066 1810.6662 H(115)C(70)N(5)O(49) 1809.666066 H(115)C(70)N(5)O(49) N-linked glycosylation 1519 0.5 +dHex(1)Hex(5)HexA(1)HexNAc(3)Sulf(1)@N 1821.549047 1822.6091 H(107)C(66)N(3)O(53)S(1) 1821.549047 H(107)C(66)N(3)O(53)S(1) N-linked glycosylation 1520 0.5 +Hex(7)HexNAc(3)Phos(1)@N 1823.574213 1824.5417 H(110)C(66)N(3)O(53)P(1) 1823.574213 H(110)C(66)N(3)O(53)P(1) N-linked glycosylation 1521 0.5 +Hex(6)HexNAc(4)Me(3)@N 1826.681382 1827.6934 H(118)C(71)N(4)O(50) 1826.681382 H(118)C(71)N(4)O(50) N-linked glycosylation 1522 0.5 +dHex(2)Hex(4)HexNAc(4)Sulf(1)@N 1832.601417 1833.6781 H(112)C(68)N(4)O(51)S(1) 1832.601417 H(112)C(68)N(4)O(51)S(1) N-linked glycosylation 1523 0.5 +Hex(4)HexNAc(3)NeuAc(2)@N 1839.640245 1840.6491 H(113)C(70)N(5)O(51) 1839.640245 H(113)C(70)N(5)O(51) N-linked glycosylation 1524 0.5 +dHex(1)Hex(3)HexNAc(4)Pent(3)@N 1840.660646 1841.6769 H(116)C(71)N(4)O(51) 1840.660646 H(116)C(71)N(4)O(51) N-linked glycosylation 1525 0.5 +dHex(2)Hex(5)HexNAc(3)Pent(1)@N 1843.660312 1844.6776 H(117)C(71)N(3)O(52) 1843.660312 H(117)C(71)N(3)O(52) N-linked glycosylation 1526 0.5 +dHex(1)Hex(5)HexNAc(4)Sulf(1)@N 1848.596331 1849.6775 H(112)C(68)N(4)O(52)S(1) 1848.596331 H(112)C(68)N(4)O(52)S(1) N-linked glycosylation 1527 0.5 +dHex(2)Hex(3)HexNAc(4)Pent(2)@N 1854.676296 1855.7035 H(118)C(72)N(4)O(51) 1854.676296 H(118)C(72)N(4)O(51) N-linked glycosylation 1528 0.5 +dHex(1)Hex(5)HexNAc(3)NeuAc(1)@N 1856.655561 1857.6763 H(116)C(71)N(4)O(52) 1856.655561 H(116)C(71)N(4)O(52) N-linked glycosylation 1529 0.5 +Hex(3)HexNAc(6)Sulf(2)@N 1864.548335 1865.7033 H(108)C(66)N(6)O(51)S(2) 1864.548335 H(108)C(66)N(6)O(51)S(2) N-linked glycosylation 1530 0.5 +Hex(9)HexNAc(2)@N 1864.634157 1865.6504 H(116)C(70)N(2)O(55) 1864.634157 H(116)C(70)N(2)O(55) N-linked glycosylation 1531 0.5 +Hex(4)HexNAc(6)@N 1866.68753 1867.7175 H(118)C(72)N(6)O(50) 1866.68753 H(118)C(72)N(6)O(50) N-linked glycosylation 1532 0.5 +dHex(3)Hex(3)HexNAc(4)Pent(1)@N 1868.691946 1869.7301 H(120)C(73)N(4)O(51) 1868.691946 H(120)C(73)N(4)O(51) N-linked glycosylation 1533 0.5 +dHex(1)Hex(5)HexNAc(3)NeuGc(1)@N 1872.650475 1873.6757 H(116)C(71)N(4)O(53) 
1872.650475 H(116)C(71)N(4)O(53) N-linked glycosylation 1534 0.5 +dHex(2)Hex(4)HexNAc(4)Pent(1)@N 1884.686861 1885.7295 H(120)C(73)N(4)O(52) 1884.686861 H(120)C(73)N(4)O(52) N-linked glycosylation 1535 0.5 +dHex(1)Hex(4)HexNAc(5)Sulf(1)@N 1889.62288 1890.7294 H(115)C(70)N(5)O(52)S(1) 1889.62288 H(115)C(70)N(5)O(52)S(1) N-linked glycosylation 1536 0.5 +dHex(1)Hex(7)HexNAc(3)@N 1889.665791 1890.703 H(119)C(72)N(3)O(54) 1889.665791 H(119)C(72)N(3)O(54) N-linked glycosylation 1537 0.5 +dHex(1)Hex(5)HexNAc(4)Pent(1)@N 1900.681776 1901.7289 H(120)C(73)N(4)O(53) 1900.681776 H(120)C(73)N(4)O(53) N-linked glycosylation 1538 0.5 +dHex(1)Hex(5)HexA(1)HexNAc(3)Sulf(2)@N 1901.505861 1902.6723 H(107)C(66)N(3)O(56)S(2) 1901.505861 H(107)C(66)N(3)O(56)S(2) N-linked glycosylation 1539 0.5 +Hex(3)HexNAc(7)@N 1907.714079 1908.7694 H(121)C(74)N(7)O(50) 1907.714079 H(121)C(74)N(7)O(50) N-linked glycosylation 1540 0.5 +dHex(2)Hex(5)HexNAc(4)@N 1914.697426 1915.7555 H(122)C(74)N(4)O(53) 1914.697426 H(122)C(74)N(4)O(53) N-linked glycosylation 1541 0.5 +dHex(2)Hex(4)HexNAc(3)NeuAc(1)Sulf(1)@N 1920.617461 1921.7401 H(116)C(71)N(4)O(54)S(1) 1920.617461 H(116)C(71)N(4)O(54)S(1) N-linked glycosylation 1542 0.5 +dHex(1)Hex(5)HexNAc(4)Sulf(2)@N 1928.553146 1929.7407 H(112)C(68)N(4)O(55)S(2) 1928.553146 H(112)C(68)N(4)O(55)S(2) N-linked glycosylation 1543 0.5 +dHex(1)Hex(5)HexNAc(4)Me(2)Pent(1)@N 1928.713076 1929.7821 H(124)C(75)N(4)O(53) 1928.713076 H(124)C(75)N(4)O(53) N-linked glycosylation 1544 0.5 +Hex(5)HexNAc(4)NeuGc(1)@N 1929.671939 1930.7271 H(119)C(73)N(5)O(54) 1929.671939 H(119)C(73)N(5)O(54) N-linked glycosylation 1545 0.5 +dHex(1)Hex(3)HexNAc(6)Sulf(1)@N 1930.64943 1931.7813 H(118)C(72)N(6)O(52)S(1) 1930.64943 H(118)C(72)N(6)O(52)S(1) N-linked glycosylation 1546 0.5 +dHex(1)Hex(6)HexNAc(4)@N 1930.69234 1931.7549 H(122)C(74)N(4)O(54) 1930.69234 H(122)C(74)N(4)O(54) N-linked glycosylation 1547 0.5 +dHex(1)Hex(5)HexNAc(3)NeuAc(1)Sulf(1)@N 1936.612375 1937.7395 H(116)C(71)N(4)O(55)S(1) 1936.612375 H(116)C(71)N(4)O(55)S(1) N-linked glycosylation 1548 0.5 +Hex(7)HexNAc(4)@N 1946.687255 1947.7543 H(122)C(74)N(4)O(55) 1946.687255 H(122)C(74)N(4)O(55) N-linked glycosylation 1549 0.5 +dHex(1)Hex(5)HexNAc(3)NeuGc(1)Sulf(1)@N 1952.60729 1953.7389 H(116)C(71)N(4)O(56)S(1) 1952.60729 H(116)C(71)N(4)O(56)S(1) N-linked glycosylation 1550 0.5 +Hex(4)HexNAc(5)NeuAc(1)@N 1954.703574 1955.7796 H(122)C(75)N(6)O(53) 1954.703574 H(122)C(75)N(6)O(53) N-linked glycosylation 1551 0.5 +Hex(6)HexNAc(4)Me(3)Pent(1)@N 1958.72364 1959.808 H(126)C(76)N(4)O(54) 1958.72364 H(126)C(76)N(4)O(54) N-linked glycosylation 1552 0.5 +dHex(1)Hex(7)HexNAc(3)Sulf(1)@N 1969.622606 1970.7662 H(119)C(72)N(3)O(57)S(1) 1969.622606 H(119)C(72)N(3)O(57)S(1) N-linked glycosylation 1553 0.5 +dHex(1)Hex(7)HexNAc(3)Phos(1)@N 1969.632122 1970.6829 H(120)C(72)N(3)O(57)P(1) 1969.632122 H(120)C(72)N(3)O(57)P(1) N-linked glycosylation 1554 0.5 +dHex(1)Hex(5)HexNAc(5)@N 1971.718889 1972.8068 H(125)C(76)N(5)O(54) 1971.718889 H(125)C(76)N(5)O(54) N-linked glycosylation 1555 0.5 +dHex(1)Hex(4)HexNAc(4)NeuAc(1)Sulf(1)@N 1977.638925 1978.7915 H(119)C(73)N(5)O(55)S(1) 1977.638925 H(119)C(73)N(5)O(55)S(1) N-linked glycosylation 1556 0.5 +dHex(3)Hex(4)HexNAc(4)Sulf(1)@N 1978.659326 1979.8193 H(122)C(74)N(4)O(55)S(1) 1978.659326 H(122)C(74)N(4)O(55)S(1) N-linked glycosylation 1557 0.5 +Hex(3)HexNAc(7)Sulf(1)@N 1987.670893 1988.8326 H(121)C(74)N(7)O(53)S(1) 1987.670893 H(121)C(74)N(7)O(53)S(1) N-linked glycosylation 1558 0.5 +Hex(6)HexNAc(5)@N 1987.713804 1988.8062 
H(125)C(76)N(5)O(55) 1987.713804 H(125)C(76)N(5)O(55) N-linked glycosylation 1559 0.5 +Hex(5)HexNAc(4)NeuAc(1)Sulf(1)@N 1993.633839 1994.7909 H(119)C(73)N(5)O(56)S(1) 1993.633839 H(119)C(73)N(5)O(56)S(1) N-linked glycosylation 1560 0.5 +Hex(3)HexNAc(6)NeuAc(1)@N 1995.730123 1996.8315 H(125)C(77)N(7)O(53) 1995.730123 H(125)C(77)N(7)O(53) N-linked glycosylation 1561 0.5 +dHex(2)Hex(3)HexNAc(6)@N 1996.750524 1997.8593 H(128)C(78)N(6)O(53) 1996.750524 H(128)C(78)N(6)O(53) N-linked glycosylation 1562 0.5 +Hex(1)HexNAc(1)NeuGc(1)@S 672.222527 672.5871 H(40)C(25)N(2)O(19) 672.222527 H(40)C(25)N(2)O(19) O-linked glycosylation 1563 0.5 +Hex(1)HexNAc(1)NeuGc(1)@T 672.222527 672.5871 H(40)C(25)N(2)O(19) 672.222527 H(40)C(25)N(2)O(19) O-linked glycosylation 1563 0.5 +dHex(1)Hex(2)HexNAc(1)@S 673.242928 673.6149 H(43)C(26)N(1)O(19) 673.242928 H(43)C(26)N(1)O(19) O-linked glycosylation 1564 0.5 +dHex(1)Hex(2)HexNAc(1)@T 673.242928 673.6149 H(43)C(26)N(1)O(19) 673.242928 H(43)C(26)N(1)O(19) O-linked glycosylation 1564 0.5 +HexNAc(3)Sulf(1)@T 689.194932 689.6408 H(39)C(24)N(3)O(18)S(1) 689.194932 H(39)C(24)N(3)O(18)S(1) O-linked glycosylation 1565 0.5 +HexNAc(3)Sulf(1)@S 689.194932 689.6408 H(39)C(24)N(3)O(18)S(1) 689.194932 H(39)C(24)N(3)O(18)S(1) O-linked glycosylation 1565 0.5 +Hex(3)HexNAc(1)@T 689.237843 689.6143 H(43)C(26)N(1)O(20) 689.237843 H(43)C(26)N(1)O(20) O-linked glycosylation 1566 0.5 +Hex(3)HexNAc(1)@S 689.237843 689.6143 H(43)C(26)N(1)O(20) 689.237843 H(43)C(26)N(1)O(20) O-linked glycosylation 1566 0.5 +Hex(3)HexNAc(1)@N 689.237843 689.6143 H(43)C(26)N(1)O(20) 689.237843 H(43)C(26)N(1)O(20) N-linked glycosylation 1566 0.5 +Hex(1)HexNAc(1)Kdn(1)Sulf(1)@T 695.157878 695.599 H(37)C(23)N(1)O(21)S(1) 695.157878 H(37)C(23)N(1)O(21)S(1) O-linked glycosylation 1567 0.5 +Hex(1)HexNAc(1)Kdn(1)Sulf(1)@S 695.157878 695.599 H(37)C(23)N(1)O(21)S(1) 695.157878 H(37)C(23)N(1)O(21)S(1) O-linked glycosylation 1567 0.5 +HexNAc(2)NeuAc(1)@S 697.254162 697.6396 H(43)C(27)N(3)O(18) 697.254162 H(43)C(27)N(3)O(18) O-linked glycosylation 1568 0.5 +HexNAc(2)NeuAc(1)@T 697.254162 697.6396 H(43)C(27)N(3)O(18) 697.254162 H(43)C(27)N(3)O(18) O-linked glycosylation 1568 0.5 +HexNAc(1)Kdn(2)@T 703.217108 703.5978 H(41)C(26)N(1)O(21) 703.217108 H(41)C(26)N(1)O(21) O-linked glycosylation 1570 0.5 +HexNAc(1)Kdn(2)@S 703.217108 703.5978 H(41)C(26)N(1)O(21) 703.217108 H(41)C(26)N(1)O(21) O-linked glycosylation 1570 0.5 +Hex(3)HexNAc(1)Me(1)@S 703.253493 703.6409 H(45)C(27)N(1)O(20) 703.253493 H(45)C(27)N(1)O(20) O-linked glycosylation 1571 0.5 +Hex(3)HexNAc(1)Me(1)@T 703.253493 703.6409 H(45)C(27)N(1)O(20) 703.253493 H(45)C(27)N(1)O(20) O-linked glycosylation 1571 0.5 +Hex(2)HexA(1)Pent(1)Sulf(1)@T 712.136808 712.5831 H(36)C(23)O(23)S(1) 712.136808 H(36)C(23)O(23)S(1) O-linked glycosylation 1572 0.5 +Hex(2)HexA(1)Pent(1)Sulf(1)@S 712.136808 712.5831 H(36)C(23)O(23)S(1) 712.136808 H(36)C(23)O(23)S(1) O-linked glycosylation 1572 0.5 +HexNAc(2)NeuGc(1)@S 713.249076 713.639 H(43)C(27)N(3)O(19) 713.249076 H(43)C(27)N(3)O(19) O-linked glycosylation 1573 0.5 +HexNAc(2)NeuGc(1)@T 713.249076 713.639 H(43)C(27)N(3)O(19) 713.249076 H(43)C(27)N(3)O(19) O-linked glycosylation 1573 0.5 +Hex(4)Phos(1)@T 728.177625 728.5423 H(41)C(24)O(23)P(1) 728.177625 H(41)C(24)O(23)P(1) O-linked glycosylation 1575 0.5 +Hex(4)Phos(1)@S 728.177625 728.5423 H(41)C(24)O(23)P(1) 728.177625 H(41)C(24)O(23)P(1) O-linked glycosylation 1575 0.5 +Hex(1)HexNAc(1)NeuAc(1)Sulf(1)@T 736.184427 736.6509 H(40)C(25)N(2)O(21)S(1) 736.184427 H(40)C(25)N(2)O(21)S(1) 
O-linked glycosylation 1577 0.5 +Hex(1)HexNAc(1)NeuAc(1)Sulf(1)@S 736.184427 736.6509 H(40)C(25)N(2)O(21)S(1) 736.184427 H(40)C(25)N(2)O(21)S(1) O-linked glycosylation 1577 0.5 +Hex(1)HexA(1)HexNAc(2)@S 744.243657 744.6498 H(44)C(28)N(2)O(21) 744.243657 H(44)C(28)N(2)O(21) O-linked glycosylation 1578 0.5 +Hex(1)HexA(1)HexNAc(2)@T 744.243657 744.6498 H(44)C(28)N(2)O(21) 744.243657 H(44)C(28)N(2)O(21) O-linked glycosylation 1578 0.5 +dHex(1)Hex(2)HexNAc(1)Sulf(1)@T 753.199743 753.6781 H(43)C(26)N(1)O(22)S(1) 753.199743 H(43)C(26)N(1)O(22)S(1) O-linked glycosylation 1579 0.5 +dHex(1)Hex(2)HexNAc(1)Sulf(1)@S 753.199743 753.6781 H(43)C(26)N(1)O(22)S(1) 753.199743 H(43)C(26)N(1)O(22)S(1) O-linked glycosylation 1579 0.5 +dHex(1)HexNAc(3)@S 755.296027 755.7188 H(49)C(30)N(3)O(19) 755.296027 H(49)C(30)N(3)O(19) O-linked glycosylation 1580 0.5 +dHex(1)HexNAc(3)@T 755.296027 755.7188 H(49)C(30)N(3)O(19) 755.296027 H(49)C(30)N(3)O(19) O-linked glycosylation 1580 0.5 +dHex(1)Hex(1)HexNAc(1)Kdn(1)@T 761.258973 761.677 H(47)C(29)N(1)O(22) 761.258973 H(47)C(29)N(1)O(22) O-linked glycosylation 1581 0.5 +dHex(1)Hex(1)HexNAc(1)Kdn(1)@S 761.258973 761.677 H(47)C(29)N(1)O(22) 761.258973 H(47)C(29)N(1)O(22) O-linked glycosylation 1581 0.5 +Hex(1)HexNAc(3)@S 771.290941 771.7182 H(49)C(30)N(3)O(20) 771.290941 H(49)C(30)N(3)O(20) O-linked glycosylation 1582 0.5 +Hex(1)HexNAc(3)@T 771.290941 771.7182 H(49)C(30)N(3)O(20) 771.290941 H(49)C(30)N(3)O(20) O-linked glycosylation 1582 0.5 +HexNAc(2)NeuAc(1)Sulf(1)@T 777.210976 777.7028 H(43)C(27)N(3)O(21)S(1) 777.210976 H(43)C(27)N(3)O(21)S(1) O-linked glycosylation 1583 0.5 +HexNAc(2)NeuAc(1)Sulf(1)@S 777.210976 777.7028 H(43)C(27)N(3)O(21)S(1) 777.210976 H(43)C(27)N(3)O(21)S(1) O-linked glycosylation 1583 0.5 +dHex(2)Hex(3)@S 778.274288 778.7042 H(50)C(30)O(23) 778.274288 H(50)C(30)O(23) O-linked glycosylation 1584 0.5 +dHex(2)Hex(3)@T 778.274288 778.7042 H(50)C(30)O(23) 778.274288 H(50)C(30)O(23) O-linked glycosylation 1584 0.5 +Hex(2)HexA(1)HexNAc(1)Sulf(1)@T 783.173922 783.661 H(41)C(26)N(1)O(24)S(1) 783.173922 H(41)C(26)N(1)O(24)S(1) O-linked glycosylation 1585 0.5 +Hex(2)HexA(1)HexNAc(1)Sulf(1)@S 783.173922 783.661 H(41)C(26)N(1)O(24)S(1) 783.173922 H(41)C(26)N(1)O(24)S(1) O-linked glycosylation 1585 0.5 +dHex(2)Hex(2)HexA(1)@S 792.253553 792.6877 H(48)C(30)O(24) 792.253553 H(48)C(30)O(24) O-linked glycosylation 1586 0.5 +dHex(2)Hex(2)HexA(1)@T 792.253553 792.6877 H(48)C(30)O(24) 792.253553 H(48)C(30)O(24) O-linked glycosylation 1586 0.5 +dHex(1)Hex(1)HexNAc(2)Sulf(1)@T 794.226292 794.73 H(46)C(28)N(2)O(22)S(1) 794.226292 H(46)C(28)N(2)O(22)S(1) O-linked glycosylation 1587 0.5 +dHex(1)Hex(1)HexNAc(2)Sulf(1)@S 794.226292 794.73 H(46)C(28)N(2)O(22)S(1) 794.226292 H(46)C(28)N(2)O(22)S(1) O-linked glycosylation 1587 0.5 +dHex(1)Hex(1)HexNAc(1)NeuAc(1)@S 802.285522 802.7289 H(50)C(31)N(2)O(22) 802.285522 H(50)C(31)N(2)O(22) O-linked glycosylation 1588 0.5 +dHex(1)Hex(1)HexNAc(1)NeuAc(1)@T 802.285522 802.7289 H(50)C(31)N(2)O(22) 802.285522 H(50)C(31)N(2)O(22) O-linked glycosylation 1588 0.5 +Hex(2)HexNAc(2)Sulf(1)@T 810.221207 810.7294 H(46)C(28)N(2)O(23)S(1) 810.221207 H(46)C(28)N(2)O(23)S(1) O-linked glycosylation 1589 0.5 +Hex(2)HexNAc(2)Sulf(1)@S 810.221207 810.7294 H(46)C(28)N(2)O(23)S(1) 810.221207 H(46)C(28)N(2)O(23)S(1) O-linked glycosylation 1589 0.5 +Hex(5)@S 810.264117 810.703 H(50)C(30)O(25) 810.264117 H(50)C(30)O(25) O-linked glycosylation 1590 0.5 +Hex(5)@T 810.264117 810.703 H(50)C(30)O(25) 810.264117 H(50)C(30)O(25) O-linked glycosylation 1590 0.5 
+HexNAc(4)@S 812.31749 812.7701 H(52)C(32)N(4)O(20) 812.31749 H(52)C(32)N(4)O(20) O-linked glycosylation 1591 0.5
+HexNAc(4)@T 812.31749 812.7701 H(52)C(32)N(4)O(20) 812.31749 H(52)C(32)N(4)O(20) O-linked glycosylation 1591 0.5
+HexNAc(1)NeuGc(2)@S 817.260035 817.7005 H(47)C(30)N(3)O(23) 817.260035 H(47)C(30)N(3)O(23) O-linked glycosylation 1592 0.5
+HexNAc(1)NeuGc(2)@T 817.260035 817.7005 H(47)C(30)N(3)O(23) 817.260035 H(47)C(30)N(3)O(23) O-linked glycosylation 1592 0.5
+dHex(1)Hex(1)HexNAc(1)NeuGc(1)@T 818.280436 818.7283 H(50)C(31)N(2)O(23) 818.280436 H(50)C(31)N(2)O(23) O-linked glycosylation 1593 0.5
+dHex(1)Hex(1)HexNAc(1)NeuGc(1)@S 818.280436 818.7283 H(50)C(31)N(2)O(23) 818.280436 H(50)C(31)N(2)O(23) O-linked glycosylation 1593 0.5
+dHex(2)Hex(2)HexNAc(1)@S 819.300837 819.7561 H(53)C(32)N(1)O(23) 819.300837 H(53)C(32)N(1)O(23) O-linked glycosylation 1594 0.5
+dHex(2)Hex(2)HexNAc(1)@T 819.300837 819.7561 H(53)C(32)N(1)O(23) 819.300837 H(53)C(32)N(1)O(23) O-linked glycosylation 1594 0.5
+Hex(2)HexNAc(1)NeuGc(1)@S 834.275351 834.7277 H(50)C(31)N(2)O(24) 834.275351 H(50)C(31)N(2)O(24) O-linked glycosylation 1595 0.5
+Hex(2)HexNAc(1)NeuGc(1)@T 834.275351 834.7277 H(50)C(31)N(2)O(24) 834.275351 H(50)C(31)N(2)O(24) O-linked glycosylation 1595 0.5
+dHex(1)Hex(3)HexNAc(1)@S 835.295752 835.7555 H(53)C(32)N(1)O(24) 835.295752 H(53)C(32)N(1)O(24) O-linked glycosylation 1596 0.5
+dHex(1)Hex(3)HexNAc(1)@T 835.295752 835.7555 H(53)C(32)N(1)O(24) 835.295752 H(53)C(32)N(1)O(24) O-linked glycosylation 1596 0.5
+dHex(1)Hex(2)HexA(1)HexNAc(1)@S 849.275017 849.739 H(51)C(32)N(1)O(25) 849.275017 H(51)C(32)N(1)O(25) O-linked glycosylation 1597 0.5
+dHex(1)Hex(2)HexA(1)HexNAc(1)@T 849.275017 849.739 H(51)C(32)N(1)O(25) 849.275017 H(51)C(32)N(1)O(25) O-linked glycosylation 1597 0.5
+Hex(1)HexNAc(3)Sulf(1)@T 851.247756 851.7814 H(49)C(30)N(3)O(23)S(1) 851.247756 H(49)C(30)N(3)O(23)S(1) O-linked glycosylation 1598 0.5
+Hex(1)HexNAc(3)Sulf(1)@S 851.247756 851.7814 H(49)C(30)N(3)O(23)S(1) 851.247756 H(49)C(30)N(3)O(23)S(1) O-linked glycosylation 1598 0.5
+Hex(4)HexNAc(1)@T 851.290667 851.7549 H(53)C(32)N(1)O(25) 851.290667 H(53)C(32)N(1)O(25) O-linked glycosylation 1599 0.5
+Hex(4)HexNAc(1)@S 851.290667 851.7549 H(53)C(32)N(1)O(25) 851.290667 H(53)C(32)N(1)O(25) O-linked glycosylation 1599 0.5
+Hex(4)HexNAc(1)@N 851.290667 851.7549 H(53)C(32)N(1)O(25) 851.290667 H(53)C(32)N(1)O(25) N-linked glycosylation 1599 0.5
+Hex(1)HexNAc(2)NeuAc(1)@S 859.306985 859.7802 H(53)C(33)N(3)O(23) 859.306985 H(53)C(33)N(3)O(23) O-linked glycosylation 1600 0.5
+Hex(1)HexNAc(2)NeuAc(1)@T 859.306985 859.7802 H(53)C(33)N(3)O(23) 859.306985 H(53)C(33)N(3)O(23) O-linked glycosylation 1600 0.5
+Hex(1)HexNAc(2)NeuGc(1)@S 875.3019 875.7796 H(53)C(33)N(3)O(24) 875.3019 H(53)C(33)N(3)O(24) O-linked glycosylation 1602 0.5
+Hex(1)HexNAc(2)NeuGc(1)@T 875.3019 875.7796 H(53)C(33)N(3)O(24) 875.3019 H(53)C(33)N(3)O(24) O-linked glycosylation 1602 0.5
+Hex(5)Phos(1)@T 890.230448 890.6829 H(51)C(30)O(28)P(1) 890.230448 H(51)C(30)O(28)P(1) O-linked glycosylation 1604 0.5
+Hex(5)Phos(1)@S 890.230448 890.6829 H(51)C(30)O(28)P(1) 890.230448 H(51)C(30)O(28)P(1) O-linked glycosylation 1604 0.5
+dHex(2)Hex(1)HexNAc(1)Kdn(1)@T 907.316881 907.8182 H(57)C(35)N(1)O(26) 907.316881 H(57)C(35)N(1)O(26) O-linked glycosylation 1606 0.5
+dHex(2)Hex(1)HexNAc(1)Kdn(1)@S 907.316881 907.8182 H(57)C(35)N(1)O(26) 907.316881 H(57)C(35)N(1)O(26) O-linked glycosylation 1606 0.5
+dHex(1)Hex(3)HexNAc(1)Sulf(1)@T 915.252567 915.8187 H(53)C(32)N(1)O(27)S(1) 915.252567 H(53)C(32)N(1)O(27)S(1) O-linked glycosylation 1607 0.5
+dHex(1)Hex(3)HexNAc(1)Sulf(1)@S 915.252567 915.8187 H(53)C(32)N(1)O(27)S(1) 915.252567 H(53)C(32)N(1)O(27)S(1) O-linked glycosylation 1607 0.5
+dHex(1)Hex(1)HexNAc(3)@S 917.34885 917.8594 H(59)C(36)N(3)O(24) 917.34885 H(59)C(36)N(3)O(24) O-linked glycosylation 1608 0.5
+dHex(1)Hex(1)HexNAc(3)@T 917.34885 917.8594 H(59)C(36)N(3)O(24) 917.34885 H(59)C(36)N(3)O(24) O-linked glycosylation 1608 0.5
+dHex(1)Hex(2)HexA(1)HexNAc(1)Sulf(1)@T 929.231831 929.8022 H(51)C(32)N(1)O(28)S(1) 929.231831 H(51)C(32)N(1)O(28)S(1) O-linked glycosylation 1609 0.5
+dHex(1)Hex(2)HexA(1)HexNAc(1)Sulf(1)@S 929.231831 929.8022 H(51)C(32)N(1)O(28)S(1) 929.231831 H(51)C(32)N(1)O(28)S(1) O-linked glycosylation 1609 0.5
+Hex(2)HexNAc(3)@S 933.343765 933.8588 H(59)C(36)N(3)O(25) 933.343765 H(59)C(36)N(3)O(25) O-linked glycosylation 1610 0.5
+Hex(2)HexNAc(3)@N 933.343765 933.8588 H(59)C(36)N(3)O(25) 933.343765 H(59)C(36)N(3)O(25) N-linked glycosylation 1610 0.5
+Hex(2)HexNAc(3)@T 933.343765 933.8588 H(59)C(36)N(3)O(25) 933.343765 H(59)C(36)N(3)O(25) O-linked glycosylation 1610 0.5
+Hex(1)HexNAc(2)NeuAc(1)Sulf(1)@T 939.2638 939.8434 H(53)C(33)N(3)O(26)S(1) 939.2638 H(53)C(33)N(3)O(26)S(1) O-linked glycosylation 1611 0.5
+Hex(1)HexNAc(2)NeuAc(1)Sulf(1)@S 939.2638 939.8434 H(53)C(33)N(3)O(26)S(1) 939.2638 H(53)C(33)N(3)O(26)S(1) O-linked glycosylation 1611 0.5
+dHex(2)Hex(4)@S 940.327112 940.8448 H(60)C(36)O(28) 940.327112 H(60)C(36)O(28) O-linked glycosylation 1612 0.5
+dHex(2)Hex(4)@T 940.327112 940.8448 H(60)C(36)O(28) 940.327112 H(60)C(36)O(28) O-linked glycosylation 1612 0.5
+Hex(1)HexNAc(1)NeuAc(1)Ac(1)@T 698.238177 698.6244 H(42)C(27)N(2)O(19) 698.238177 H(42)C(27)N(2)O(19) O-linked glycosylation 1786 0.5
+Hex(1)HexNAc(1)NeuAc(1)Ac(1)@S 698.238177 698.6244 H(42)C(27)N(2)O(19) 698.238177 H(42)C(27)N(2)O(19) O-linked glycosylation 1786 0.5
+dHex(2)HexNAc(2)Kdn(1)@T 948.34343 948.8701 H(60)C(37)N(2)O(26) 948.34343 H(60)C(37)N(2)O(26) O-linked glycosylation 1614 0.5
+dHex(2)HexNAc(2)Kdn(1)@S 948.34343 948.8701 H(60)C(37)N(2)O(26) 948.34343 H(60)C(37)N(2)O(26) O-linked glycosylation 1614 0.5
+dHex(1)Hex(2)HexNAc(2)Sulf(1)@T 956.279116 956.8706 H(56)C(34)N(2)O(27)S(1) 956.279116 H(56)C(34)N(2)O(27)S(1) O-linked glycosylation 1615 0.5
+dHex(1)Hex(2)HexNAc(2)Sulf(1)@S 956.279116 956.8706 H(56)C(34)N(2)O(27)S(1) 956.279116 H(56)C(34)N(2)O(27)S(1) O-linked glycosylation 1615 0.5
+dHex(1)HexNAc(4)@S 958.375399 958.9113 H(62)C(38)N(4)O(24) 958.375399 H(62)C(38)N(4)O(24) O-linked glycosylation 1616 0.5
+dHex(1)HexNAc(4)@T 958.375399 958.9113 H(62)C(38)N(4)O(24) 958.375399 H(62)C(38)N(4)O(24) O-linked glycosylation 1616 0.5
+Hex(1)HexNAc(1)NeuAc(1)NeuGc(1)@S 963.317944 963.8417 H(57)C(36)N(3)O(27) 963.317944 H(57)C(36)N(3)O(27) O-linked glycosylation 1617 0.5
+Hex(1)HexNAc(1)NeuAc(1)NeuGc(1)@T 963.317944 963.8417 H(57)C(36)N(3)O(27) 963.317944 H(57)C(36)N(3)O(27) O-linked glycosylation 1617 0.5
+dHex(1)Hex(1)HexNAc(2)Kdn(1)@T 964.338345 964.8695 H(60)C(37)N(2)O(27) 964.338345 H(60)C(37)N(2)O(27) O-linked glycosylation 1618 0.5
+dHex(1)Hex(1)HexNAc(2)Kdn(1)@S 964.338345 964.8695 H(60)C(37)N(2)O(27) 964.338345 H(60)C(37)N(2)O(27) O-linked glycosylation 1618 0.5
+Hex(1)HexNAc(1)NeuGc(2)@S 979.312859 979.8411 H(57)C(36)N(3)O(28) 979.312859 H(57)C(36)N(3)O(28) O-linked glycosylation 1619 0.5
+Hex(1)HexNAc(1)NeuGc(2)@T 979.312859 979.8411 H(57)C(36)N(3)O(28) 979.312859 H(57)C(36)N(3)O(28) O-linked glycosylation 1619 0.5
+Hex(1)HexNAc(1)NeuAc(2)Ac(1)@T 989.333594 989.879 H(59)C(38)N(3)O(27) 989.333594 H(59)C(38)N(3)O(27) O-linked glycosylation 1620 0.5
+Hex(1)HexNAc(1)NeuAc(2)Ac(1)@S 989.333594 989.879 H(59)C(38)N(3)O(27) 989.333594 H(59)C(38)N(3)O(27) O-linked glycosylation 1620 0.5
+dHex(2)Hex(2)HexA(1)HexNAc(1)@S 995.332925 995.8802 H(61)C(38)N(1)O(29) 995.332925 H(61)C(38)N(1)O(29) O-linked glycosylation 1621 0.5
+dHex(2)Hex(2)HexA(1)HexNAc(1)@T 995.332925 995.8802 H(61)C(38)N(1)O(29) 995.332925 H(61)C(38)N(1)O(29) O-linked glycosylation 1621 0.5
+dHex(1)Hex(1)HexNAc(3)Sulf(1)@T 997.305665 997.9226 H(59)C(36)N(3)O(27)S(1) 997.305665 H(59)C(36)N(3)O(27)S(1) O-linked glycosylation 1622 0.5
+dHex(1)Hex(1)HexNAc(3)Sulf(1)@S 997.305665 997.9226 H(59)C(36)N(3)O(27)S(1) 997.305665 H(59)C(36)N(3)O(27)S(1) O-linked glycosylation 1622 0.5
+Hex(2)HexA(1)NeuAc(1)Pent(1)Sulf(1)@T 1003.232225 1003.8377 H(53)C(34)N(1)O(31)S(1) 1003.232225 H(53)C(34)N(1)O(31)S(1) O-linked glycosylation 1623 0.5
+Hex(2)HexA(1)NeuAc(1)Pent(1)Sulf(1)@S 1003.232225 1003.8377 H(53)C(34)N(1)O(31)S(1) 1003.232225 H(53)C(34)N(1)O(31)S(1) O-linked glycosylation 1623 0.5
+dHex(1)Hex(1)HexNAc(2)NeuAc(1)@S 1005.364894 1005.9214 H(63)C(39)N(3)O(27) 1005.364894 H(63)C(39)N(3)O(27) O-linked glycosylation 1624 0.5
+dHex(1)Hex(1)HexNAc(2)NeuAc(1)@T 1005.364894 1005.9214 H(63)C(39)N(3)O(27) 1005.364894 H(63)C(39)N(3)O(27) O-linked glycosylation 1624 0.5
+dHex(1)Hex(3)HexA(1)HexNAc(1)@S 1011.32784 1011.8796 H(61)C(38)N(1)O(30) 1011.32784 H(61)C(38)N(1)O(30) O-linked glycosylation 1625 0.5
+dHex(1)Hex(3)HexA(1)HexNAc(1)@T 1011.32784 1011.8796 H(61)C(38)N(1)O(30) 1011.32784 H(61)C(38)N(1)O(30) O-linked glycosylation 1625 0.5
+Hex(2)HexNAc(3)Sulf(1)@T 1013.300579 1013.922 H(59)C(36)N(3)O(28)S(1) 1013.300579 H(59)C(36)N(3)O(28)S(1) O-linked glycosylation 1626 0.5
+Hex(2)HexNAc(3)Sulf(1)@S 1013.300579 1013.922 H(59)C(36)N(3)O(28)S(1) 1013.300579 H(59)C(36)N(3)O(28)S(1) O-linked glycosylation 1626 0.5
+Hex(5)HexNAc(1)@T 1013.34349 1013.8955 H(63)C(38)N(1)O(30) 1013.34349 H(63)C(38)N(1)O(30) O-linked glycosylation 1627 0.5
+Hex(5)HexNAc(1)@S 1013.34349 1013.8955 H(63)C(38)N(1)O(30) 1013.34349 H(63)C(38)N(1)O(30) O-linked glycosylation 1627 0.5
+Hex(5)HexNAc(1)@N 1013.34349 1013.8955 H(63)C(38)N(1)O(30) 1013.34349 H(63)C(38)N(1)O(30) N-linked glycosylation 1627 0.5
+HexNAc(5)@S 1015.396863 1015.9626 H(65)C(40)N(5)O(25) 1015.396863 H(65)C(40)N(5)O(25) O-linked glycosylation 1628 0.5
+HexNAc(5)@T 1015.396863 1015.9626 H(65)C(40)N(5)O(25) 1015.396863 H(65)C(40)N(5)O(25) O-linked glycosylation 1628 0.5
+Hex(1)HexNAc(1)NeuAc(2)Ac(2)@T 1031.344159 1031.9156 H(61)C(40)N(3)O(28) 1031.344159 H(61)C(40)N(3)O(28) O-linked glycosylation 1630 0.5
+Hex(1)HexNAc(1)NeuAc(2)Ac(2)@S 1031.344159 1031.9156 H(61)C(40)N(3)O(28) 1031.344159 H(61)C(40)N(3)O(28) O-linked glycosylation 1630 0.5
+Hex(2)HexNAc(2)NeuGc(1)@S 1037.354723 1037.9202 H(63)C(39)N(3)O(29) 1037.354723 H(63)C(39)N(3)O(29) O-linked glycosylation 1631 0.5
+Hex(2)HexNAc(2)NeuGc(1)@T 1037.354723 1037.9202 H(63)C(39)N(3)O(29) 1037.354723 H(63)C(39)N(3)O(29) O-linked glycosylation 1631 0.5
+Hex(5)Phos(3)@T 1050.16311 1050.6427 H(53)C(30)O(34)P(3) 1050.16311 H(53)C(30)O(34)P(3) O-linked glycosylation 1632 0.5
+Hex(5)Phos(3)@S 1050.16311 1050.6427 H(53)C(30)O(34)P(3) 1050.16311 H(53)C(30)O(34)P(3) O-linked glycosylation 1632 0.5
+Hex(6)Phos(1)@T 1052.283272 1052.8235 H(61)C(36)O(33)P(1) 1052.283272 H(61)C(36)O(33)P(1) O-linked glycosylation 1633 0.5
+Hex(6)Phos(1)@S 1052.283272 1052.8235 H(61)C(36)O(33)P(1) 1052.283272 H(61)C(36)O(33)P(1) O-linked glycosylation 1633 0.5
+dHex(1)Hex(2)HexA(1)HexNAc(2)@S 1052.354389 1052.9316 H(64)C(40)N(2)O(30) 1052.354389 H(64)C(40)N(2)O(30) O-linked glycosylation 1634 0.5 +dHex(1)Hex(2)HexA(1)HexNAc(2)@T 1052.354389 1052.9316 H(64)C(40)N(2)O(30) 1052.354389 H(64)C(40)N(2)O(30) O-linked glycosylation 1634 0.5 +dHex(2)Hex(3)HexNAc(1)Sulf(1)@T 1061.310475 1061.9599 H(63)C(38)N(1)O(31)S(1) 1061.310475 H(63)C(38)N(1)O(31)S(1) O-linked glycosylation 1635 0.5 +dHex(2)Hex(3)HexNAc(1)Sulf(1)@S 1061.310475 1061.9599 H(63)C(38)N(1)O(31)S(1) 1061.310475 H(63)C(38)N(1)O(31)S(1) O-linked glycosylation 1635 0.5 +Hex(1)HexNAc(3)NeuAc(1)@S 1062.386358 1062.9727 H(66)C(41)N(4)O(28) 1062.386358 H(66)C(41)N(4)O(28) O-linked glycosylation 1636 0.5 +Hex(1)HexNAc(3)NeuAc(1)@T 1062.386358 1062.9727 H(66)C(41)N(4)O(28) 1062.386358 H(66)C(41)N(4)O(28) O-linked glycosylation 1636 0.5 +dHex(2)Hex(1)HexNAc(3)@S 1063.406759 1064.0006 H(69)C(42)N(3)O(28) 1063.406759 H(69)C(42)N(3)O(28) O-linked glycosylation 1637 0.5 +dHex(2)Hex(1)HexNAc(3)@T 1063.406759 1064.0006 H(69)C(42)N(3)O(28) 1063.406759 H(69)C(42)N(3)O(28) O-linked glycosylation 1637 0.5 +Hex(1)HexNAc(3)NeuGc(1)@S 1078.381273 1078.9721 H(66)C(41)N(4)O(29) 1078.381273 H(66)C(41)N(4)O(29) O-linked glycosylation 1638 0.5 +Hex(1)HexNAc(3)NeuGc(1)@T 1078.381273 1078.9721 H(66)C(41)N(4)O(29) 1078.381273 H(66)C(41)N(4)O(29) O-linked glycosylation 1638 0.5 +dHex(1)Hex(1)HexNAc(2)NeuAc(1)Sulf(1)@T 1085.321709 1085.9846 H(63)C(39)N(3)O(30)S(1) 1085.321709 H(63)C(39)N(3)O(30)S(1) O-linked glycosylation 1639 0.5 +dHex(1)Hex(1)HexNAc(2)NeuAc(1)Sulf(1)@S 1085.321709 1085.9846 H(63)C(39)N(3)O(30)S(1) 1085.321709 H(63)C(39)N(3)O(30)S(1) O-linked glycosylation 1639 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(1)Sulf(1)@T 1091.284655 1091.9428 H(61)C(38)N(1)O(33)S(1) 1091.284655 H(61)C(38)N(1)O(33)S(1) O-linked glycosylation 1640 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(1)Sulf(1)@S 1091.284655 1091.9428 H(61)C(38)N(1)O(33)S(1) 1091.284655 H(61)C(38)N(1)O(33)S(1) O-linked glycosylation 1640 0.5 +dHex(1)Hex(1)HexA(1)HexNAc(3)@S 1093.380938 1093.9835 H(67)C(42)N(3)O(30) 1093.380938 H(67)C(42)N(3)O(30) O-linked glycosylation 1641 0.5 +dHex(1)Hex(1)HexA(1)HexNAc(3)@T 1093.380938 1093.9835 H(67)C(42)N(3)O(30) 1093.380938 H(67)C(42)N(3)O(30) O-linked glycosylation 1641 0.5 +Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@T 1101.316623 1101.984 H(63)C(39)N(3)O(31)S(1) 1101.316623 H(63)C(39)N(3)O(31)S(1) O-linked glycosylation 1642 0.5 +Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@S 1101.316623 1101.984 H(63)C(39)N(3)O(31)S(1) 1101.316623 H(63)C(39)N(3)O(31)S(1) O-linked glycosylation 1642 0.5 +dHex(2)Hex(2)HexNAc(2)Sulf(1)@T 1102.337025 1103.0118 H(66)C(40)N(2)O(31)S(1) 1102.337025 H(66)C(40)N(2)O(31)S(1) O-linked glycosylation 1643 0.5 +dHex(2)Hex(2)HexNAc(2)Sulf(1)@S 1102.337025 1103.0118 H(66)C(40)N(2)O(31)S(1) 1102.337025 H(66)C(40)N(2)O(31)S(1) O-linked glycosylation 1643 0.5 +dHex(2)Hex(1)HexNAc(2)Kdn(1)@T 1110.396254 1111.0107 H(70)C(43)N(2)O(31) 1110.396254 H(70)C(43)N(2)O(31) O-linked glycosylation 1644 0.5 +dHex(2)Hex(1)HexNAc(2)Kdn(1)@S 1110.396254 1111.0107 H(70)C(43)N(2)O(31) 1110.396254 H(70)C(43)N(2)O(31) O-linked glycosylation 1644 0.5 +dHex(1)Hex(1)HexNAc(4)@S 1120.428223 1121.0519 H(72)C(44)N(4)O(29) 1120.428223 H(72)C(44)N(4)O(29) O-linked glycosylation 1645 0.5 +dHex(1)Hex(1)HexNAc(4)@T 1120.428223 1121.0519 H(72)C(44)N(4)O(29) 1120.428223 H(72)C(44)N(4)O(29) O-linked glycosylation 1645 0.5 +Hex(2)HexNAc(4)@T 1136.423137 1137.0513 H(72)C(44)N(4)O(30) 1136.423137 H(72)C(44)N(4)O(30) O-linked glycosylation 1646 0.5 +Hex(2)HexNAc(4)@S 1136.423137 
1137.0513 H(72)C(44)N(4)O(30) 1136.423137 H(72)C(44)N(4)O(30) O-linked glycosylation 1646 0.5 +Hex(2)HexNAc(4)@N 1136.423137 1137.0513 H(72)C(44)N(4)O(30) 1136.423137 H(72)C(44)N(4)O(30) N-linked glycosylation 1646 0.5 +Hex(2)HexNAc(1)NeuGc(2)@S 1141.365682 1141.9817 H(67)C(42)N(3)O(33) 1141.365682 H(67)C(42)N(3)O(33) O-linked glycosylation 1647 0.5 +Hex(2)HexNAc(1)NeuGc(2)@T 1141.365682 1141.9817 H(67)C(42)N(3)O(33) 1141.365682 H(67)C(42)N(3)O(33) O-linked glycosylation 1647 0.5 +dHex(2)Hex(4)HexNAc(1)@S 1143.406484 1144.0373 H(73)C(44)N(1)O(33) 1143.406484 H(73)C(44)N(1)O(33) O-linked glycosylation 1648 0.5 +dHex(2)Hex(4)HexNAc(1)@T 1143.406484 1144.0373 H(73)C(44)N(1)O(33) 1143.406484 H(73)C(44)N(1)O(33) O-linked glycosylation 1648 0.5 +Hex(1)HexNAc(2)NeuAc(2)@S 1150.402402 1151.0348 H(70)C(44)N(4)O(31) 1150.402402 H(70)C(44)N(4)O(31) O-linked glycosylation 1649 0.5 +Hex(1)HexNAc(2)NeuAc(2)@T 1150.402402 1151.0348 H(70)C(44)N(4)O(31) 1150.402402 H(70)C(44)N(4)O(31) O-linked glycosylation 1649 0.5 +dHex(2)Hex(1)HexNAc(2)NeuAc(1)@S 1151.422803 1152.0626 H(73)C(45)N(3)O(31) 1151.422803 H(73)C(45)N(3)O(31) O-linked glycosylation 1650 0.5 +dHex(2)Hex(1)HexNAc(2)NeuAc(1)@T 1151.422803 1152.0626 H(73)C(45)N(3)O(31) 1151.422803 H(73)C(45)N(3)O(31) O-linked glycosylation 1650 0.5 +dHex(1)Hex(2)HexNAc(3)Sulf(1)@T 1159.358488 1160.0632 H(69)C(42)N(3)O(32)S(1) 1159.358488 H(69)C(42)N(3)O(32)S(1) O-linked glycosylation 1651 0.5 +dHex(1)Hex(2)HexNAc(3)Sulf(1)@S 1159.358488 1160.0632 H(69)C(42)N(3)O(32)S(1) 1159.358488 H(69)C(42)N(3)O(32)S(1) O-linked glycosylation 1651 0.5 +dHex(1)HexNAc(5)@S 1161.454772 1162.1038 H(75)C(46)N(5)O(29) 1161.454772 H(75)C(46)N(5)O(29) O-linked glycosylation 1652 0.5 +dHex(1)HexNAc(5)@T 1161.454772 1162.1038 H(75)C(46)N(5)O(29) 1161.454772 H(75)C(46)N(5)O(29) O-linked glycosylation 1652 0.5 +dHex(2)Hex(1)HexNAc(2)NeuGc(1)@T 1167.417718 1168.062 H(73)C(45)N(3)O(32) 1167.417718 H(73)C(45)N(3)O(32) O-linked glycosylation 1653 0.5 +dHex(2)Hex(1)HexNAc(2)NeuGc(1)@S 1167.417718 1168.062 H(73)C(45)N(3)O(32) 1167.417718 H(73)C(45)N(3)O(32) O-linked glycosylation 1653 0.5 +dHex(3)Hex(2)HexNAc(2)@S 1168.438119 1169.0898 H(76)C(46)N(2)O(32) 1168.438119 H(76)C(46)N(2)O(32) O-linked glycosylation 1654 0.5 +dHex(3)Hex(2)HexNAc(2)@T 1168.438119 1169.0898 H(76)C(46)N(2)O(32) 1168.438119 H(76)C(46)N(2)O(32) O-linked glycosylation 1654 0.5 +Hex(3)HexNAc(3)Sulf(1)@T 1175.353403 1176.0626 H(69)C(42)N(3)O(33)S(1) 1175.353403 H(69)C(42)N(3)O(33)S(1) O-linked glycosylation 1655 0.5 +Hex(3)HexNAc(3)Sulf(1)@S 1175.353403 1176.0626 H(69)C(42)N(3)O(33)S(1) 1175.353403 H(69)C(42)N(3)O(33)S(1) O-linked glycosylation 1655 0.5 +Hex(3)HexNAc(3)Sulf(1)@N 1175.353403 1176.0626 H(69)C(42)N(3)O(33)S(1) 1175.353403 H(69)C(42)N(3)O(33)S(1) N-linked glycosylation 1655 0.5 +dHex(2)Hex(2)HexNAc(2)Sulf(2)@T 1182.293839 1183.075 H(66)C(40)N(2)O(34)S(2) 1182.293839 H(66)C(40)N(2)O(34)S(2) O-linked glycosylation 1656 0.5 +dHex(2)Hex(2)HexNAc(2)Sulf(2)@S 1182.293839 1183.075 H(66)C(40)N(2)O(34)S(2) 1182.293839 H(66)C(40)N(2)O(34)S(2) O-linked glycosylation 1656 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(1)@N 1183.412632 1184.0614 H(73)C(45)N(3)O(33) 1183.412632 H(73)C(45)N(3)O(33) N-linked glycosylation 1657 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(1)@T 1183.412632 1184.0614 H(73)C(45)N(3)O(33) 1183.412632 H(73)C(45)N(3)O(33) O-linked glycosylation 1657 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(1)@S 1183.412632 1184.0614 H(73)C(45)N(3)O(33) 1183.412632 H(73)C(45)N(3)O(33) O-linked glycosylation 1657 0.5 +dHex(1)Hex(1)HexNAc(3)NeuAc(1)@T 
1208.444267 1209.1139 H(76)C(47)N(4)O(32) 1208.444267 H(76)C(47)N(4)O(32) O-linked glycosylation 1658 0.5 +dHex(1)Hex(1)HexNAc(3)NeuAc(1)@S 1208.444267 1209.1139 H(76)C(47)N(4)O(32) 1208.444267 H(76)C(47)N(4)O(32) O-linked glycosylation 1658 0.5 +Hex(6)Phos(3)@T 1212.215934 1212.7833 H(63)C(36)O(39)P(3) 1212.215934 H(63)C(36)O(39)P(3) O-linked glycosylation 1659 0.5 +Hex(6)Phos(3)@S 1212.215934 1212.7833 H(63)C(36)O(39)P(3) 1212.215934 H(63)C(36)O(39)P(3) O-linked glycosylation 1659 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(2)@S 1214.407213 1215.0722 H(74)C(46)N(2)O(35) 1214.407213 H(74)C(46)N(2)O(35) O-linked glycosylation 1660 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(2)@T 1214.407213 1215.0722 H(74)C(46)N(2)O(35) 1214.407213 H(74)C(46)N(2)O(35) O-linked glycosylation 1660 0.5 +dHex(1)Hex(1)HexNAc(3)NeuGc(1)@T 1224.439181 1225.1133 H(76)C(47)N(4)O(33) 1224.439181 H(76)C(47)N(4)O(33) O-linked glycosylation 1661 0.5 +dHex(1)Hex(1)HexNAc(3)NeuGc(1)@S 1224.439181 1225.1133 H(76)C(47)N(4)O(33) 1224.439181 H(76)C(47)N(4)O(33) O-linked glycosylation 1661 0.5 +Hex(1)HexNAc(2)NeuAc(2)Sulf(1)@T 1230.359217 1231.098 H(70)C(44)N(4)O(34)S(1) 1230.359217 H(70)C(44)N(4)O(34)S(1) O-linked glycosylation 1662 0.5 +Hex(1)HexNAc(2)NeuAc(2)Sulf(1)@S 1230.359217 1231.098 H(70)C(44)N(4)O(34)S(1) 1230.359217 H(70)C(44)N(4)O(34)S(1) O-linked glycosylation 1662 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(1)Sulf(1)@T 1237.342563 1238.084 H(71)C(44)N(1)O(37)S(1) 1237.342563 H(71)C(44)N(1)O(37)S(1) O-linked glycosylation 1663 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(1)Sulf(1)@S 1237.342563 1238.084 H(71)C(44)N(1)O(37)S(1) 1237.342563 H(71)C(44)N(1)O(37)S(1) O-linked glycosylation 1663 0.5 +Hex(1)HexNAc(1)NeuAc(3)@S 1238.418446 1239.0969 H(74)C(47)N(4)O(34) 1238.418446 H(74)C(47)N(4)O(34) O-linked glycosylation 1664 0.5 +Hex(1)HexNAc(1)NeuAc(3)@T 1238.418446 1239.0969 H(74)C(47)N(4)O(34) 1238.418446 H(74)C(47)N(4)O(34) O-linked glycosylation 1664 0.5 +Hex(2)HexNAc(3)NeuGc(1)@S 1240.434096 1241.1127 H(76)C(47)N(4)O(34) 1240.434096 H(76)C(47)N(4)O(34) O-linked glycosylation 1665 0.5 +Hex(2)HexNAc(3)NeuGc(1)@T 1240.434096 1241.1127 H(76)C(47)N(4)O(34) 1240.434096 H(76)C(47)N(4)O(34) O-linked glycosylation 1665 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@T 1247.374532 1248.1252 H(73)C(45)N(3)O(35)S(1) 1247.374532 H(73)C(45)N(3)O(35)S(1) O-linked glycosylation 1666 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@S 1247.374532 1248.1252 H(73)C(45)N(3)O(35)S(1) 1247.374532 H(73)C(45)N(3)O(35)S(1) O-linked glycosylation 1666 0.5 +dHex(3)Hex(1)HexNAc(2)Kdn(1)@T 1256.454163 1257.1519 H(80)C(49)N(2)O(35) 1256.454163 H(80)C(49)N(2)O(35) O-linked glycosylation 1667 0.5 +dHex(3)Hex(1)HexNAc(2)Kdn(1)@S 1256.454163 1257.1519 H(80)C(49)N(2)O(35) 1256.454163 H(80)C(49)N(2)O(35) O-linked glycosylation 1667 0.5 +dHex(2)Hex(3)HexNAc(2)Sulf(1)@T 1264.389848 1265.1524 H(76)C(46)N(2)O(36)S(1) 1264.389848 H(76)C(46)N(2)O(36)S(1) O-linked glycosylation 1668 0.5 +dHex(2)Hex(3)HexNAc(2)Sulf(1)@S 1264.389848 1265.1524 H(76)C(46)N(2)O(36)S(1) 1264.389848 H(76)C(46)N(2)O(36)S(1) O-linked glycosylation 1668 0.5 +dHex(2)Hex(2)HexNAc(2)Kdn(1)@T 1272.449077 1273.1513 H(80)C(49)N(2)O(36) 1272.449077 H(80)C(49)N(2)O(36) O-linked glycosylation 1669 0.5 +dHex(2)Hex(2)HexNAc(2)Kdn(1)@S 1272.449077 1273.1513 H(80)C(49)N(2)O(36) 1272.449077 H(80)C(49)N(2)O(36) O-linked glycosylation 1669 0.5 +dHex(2)Hex(2)HexA(1)HexNAc(2)Sulf(1)@T 1278.369113 1279.136 H(74)C(46)N(2)O(37)S(1) 1278.369113 H(74)C(46)N(2)O(37)S(1) O-linked glycosylation 1670 0.5 +dHex(2)Hex(2)HexA(1)HexNAc(2)Sulf(1)@S 1278.369113 1279.136 
H(74)C(46)N(2)O(37)S(1) 1278.369113 H(74)C(46)N(2)O(37)S(1) O-linked glycosylation 1670 0.5 +dHex(1)Hex(2)HexNAc(4)@T 1282.481046 1283.1925 H(82)C(50)N(4)O(34) 1282.481046 H(82)C(50)N(4)O(34) O-linked glycosylation 1671 0.5 +dHex(1)Hex(2)HexNAc(4)@S 1282.481046 1283.1925 H(82)C(50)N(4)O(34) 1282.481046 H(82)C(50)N(4)O(34) O-linked glycosylation 1671 0.5 +dHex(1)Hex(2)HexNAc(4)@N 1282.481046 1283.1925 H(82)C(50)N(4)O(34) 1282.481046 H(82)C(50)N(4)O(34) N-linked glycosylation 1671 0.5 +Hex(1)HexNAc(1)NeuGc(3)@S 1286.40319 1287.0951 H(74)C(47)N(4)O(37) 1286.40319 H(74)C(47)N(4)O(37) O-linked glycosylation 1672 0.5 +Hex(1)HexNAc(1)NeuGc(3)@T 1286.40319 1287.0951 H(74)C(47)N(4)O(37) 1286.40319 H(74)C(47)N(4)O(37) O-linked glycosylation 1672 0.5 +dHex(1)Hex(1)HexNAc(3)NeuAc(1)Sulf(1)@T 1288.401081 1289.1771 H(76)C(47)N(4)O(35)S(1) 1288.401081 H(76)C(47)N(4)O(35)S(1) O-linked glycosylation 1673 0.5 +dHex(1)Hex(1)HexNAc(3)NeuAc(1)Sulf(1)@S 1288.401081 1289.1771 H(76)C(47)N(4)O(35)S(1) 1288.401081 H(76)C(47)N(4)O(35)S(1) O-linked glycosylation 1673 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(2)Sulf(1)@T 1294.364027 1295.1354 H(74)C(46)N(2)O(38)S(1) 1294.364027 H(74)C(46)N(2)O(38)S(1) O-linked glycosylation 1674 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(2)Sulf(1)@S 1294.364027 1295.1354 H(74)C(46)N(2)O(38)S(1) 1294.364027 H(74)C(46)N(2)O(38)S(1) O-linked glycosylation 1674 0.5 +dHex(1)Hex(1)HexNAc(2)NeuAc(2)@S 1296.460311 1297.176 H(80)C(50)N(4)O(35) 1296.460311 H(80)C(50)N(4)O(35) O-linked glycosylation 1675 0.5 +dHex(1)Hex(1)HexNAc(2)NeuAc(2)@T 1296.460311 1297.176 H(80)C(50)N(4)O(35) 1296.460311 H(80)C(50)N(4)O(35) O-linked glycosylation 1675 0.5 +dHex(3)HexNAc(3)Kdn(1)@T 1297.480712 1298.2038 H(83)C(51)N(3)O(35) 1297.480712 H(83)C(51)N(3)O(35) O-linked glycosylation 1676 0.5 +dHex(3)HexNAc(3)Kdn(1)@S 1297.480712 1298.2038 H(83)C(51)N(3)O(35) 1297.480712 H(83)C(51)N(3)O(35) O-linked glycosylation 1676 0.5 +Hex(2)HexNAc(3)NeuAc(1)Sulf(1)@T 1304.395996 1305.1765 H(76)C(47)N(4)O(36)S(1) 1304.395996 H(76)C(47)N(4)O(36)S(1) O-linked glycosylation 1678 0.5 +Hex(2)HexNAc(3)NeuAc(1)Sulf(1)@S 1304.395996 1305.1765 H(76)C(47)N(4)O(36)S(1) 1304.395996 H(76)C(47)N(4)O(36)S(1) O-linked glycosylation 1678 0.5 +dHex(2)Hex(2)HexNAc(3)Sulf(1)@T 1305.416397 1306.2044 H(79)C(48)N(3)O(36)S(1) 1305.416397 H(79)C(48)N(3)O(36)S(1) O-linked glycosylation 1679 0.5 +dHex(2)Hex(2)HexNAc(3)Sulf(1)@S 1305.416397 1306.2044 H(79)C(48)N(3)O(36)S(1) 1305.416397 H(79)C(48)N(3)O(36)S(1) O-linked glycosylation 1679 0.5 +dHex(2)HexNAc(5)@S 1307.512681 1308.245 H(85)C(52)N(5)O(33) 1307.512681 H(85)C(52)N(5)O(33) O-linked glycosylation 1680 0.5 +dHex(2)HexNAc(5)@T 1307.512681 1308.245 H(85)C(52)N(5)O(33) 1307.512681 H(85)C(52)N(5)O(33) O-linked glycosylation 1680 0.5 +Hex(2)HexNAc(2)NeuAc(2)@S 1312.455225 1313.1754 H(80)C(50)N(4)O(36) 1312.455225 H(80)C(50)N(4)O(36) O-linked glycosylation 1681 0.5 +Hex(2)HexNAc(2)NeuAc(2)@T 1312.455225 1313.1754 H(80)C(50)N(4)O(36) 1312.455225 H(80)C(50)N(4)O(36) O-linked glycosylation 1681 0.5 +dHex(2)Hex(2)HexNAc(2)NeuAc(1)@T 1313.475627 1314.2032 H(83)C(51)N(3)O(36) 1313.475627 H(83)C(51)N(3)O(36) O-linked glycosylation 1682 0.5 +dHex(2)Hex(2)HexNAc(2)NeuAc(1)@S 1313.475627 1314.2032 H(83)C(51)N(3)O(36) 1313.475627 H(83)C(51)N(3)O(36) O-linked glycosylation 1682 0.5 +dHex(1)Hex(3)HexNAc(3)Sulf(1)@T 1321.411312 1322.2038 H(79)C(48)N(3)O(37)S(1) 1321.411312 H(79)C(48)N(3)O(37)S(1) O-linked glycosylation 1683 0.5 +dHex(1)Hex(3)HexNAc(3)Sulf(1)@S 1321.411312 1322.2038 H(79)C(48)N(3)O(37)S(1) 1321.411312 
H(79)C(48)N(3)O(37)S(1) O-linked glycosylation 1683 0.5 +dHex(2)Hex(2)HexNAc(2)NeuGc(1)@T 1329.470541 1330.2026 H(83)C(51)N(3)O(37) 1329.470541 H(83)C(51)N(3)O(37) O-linked glycosylation 1684 0.5 +dHex(2)Hex(2)HexNAc(2)NeuGc(1)@S 1329.470541 1330.2026 H(83)C(51)N(3)O(37) 1329.470541 H(83)C(51)N(3)O(37) O-linked glycosylation 1684 0.5 +Hex(2)HexNAc(5)@S 1339.50251 1340.2438 H(85)C(52)N(5)O(35) 1339.50251 H(85)C(52)N(5)O(35) O-linked glycosylation 1685 0.5 +Hex(2)HexNAc(5)@T 1339.50251 1340.2438 H(85)C(52)N(5)O(35) 1339.50251 H(85)C(52)N(5)O(35) O-linked glycosylation 1685 0.5 +dHex(1)Hex(3)HexNAc(2)NeuGc(1)@S 1345.465456 1346.202 H(83)C(51)N(3)O(38) 1345.465456 H(83)C(51)N(3)O(38) O-linked glycosylation 1686 0.5 +dHex(1)Hex(3)HexNAc(2)NeuGc(1)@T 1345.465456 1346.202 H(83)C(51)N(3)O(38) 1345.465456 H(83)C(51)N(3)O(38) O-linked glycosylation 1686 0.5 +Hex(1)HexNAc(3)NeuAc(2)@S 1353.481775 1354.2273 H(83)C(52)N(5)O(36) 1353.481775 H(83)C(52)N(5)O(36) O-linked glycosylation 1687 0.5 +Hex(1)HexNAc(3)NeuAc(2)@T 1353.481775 1354.2273 H(83)C(52)N(5)O(36) 1353.481775 H(83)C(52)N(5)O(36) O-linked glycosylation 1687 0.5 +dHex(1)Hex(2)HexNAc(3)NeuAc(1)@S 1370.49709 1371.2545 H(86)C(53)N(4)O(37) 1370.49709 H(86)C(53)N(4)O(37) O-linked glycosylation 1688 0.5 +dHex(1)Hex(2)HexNAc(3)NeuAc(1)@T 1370.49709 1371.2545 H(86)C(53)N(4)O(37) 1370.49709 H(86)C(53)N(4)O(37) O-linked glycosylation 1688 0.5 +dHex(3)Hex(2)HexNAc(3)@S 1371.517491 1372.2824 H(89)C(54)N(3)O(37) 1371.517491 H(89)C(54)N(3)O(37) O-linked glycosylation 1689 0.5 +dHex(3)Hex(2)HexNAc(3)@T 1371.517491 1372.2824 H(89)C(54)N(3)O(37) 1371.517491 H(89)C(54)N(3)O(37) O-linked glycosylation 1689 0.5 +Hex(7)Phos(3)@T 1374.268757 1374.9239 H(73)C(42)O(44)P(3) 1374.268757 H(73)C(42)O(44)P(3) O-linked glycosylation 1690 0.5 +Hex(7)Phos(3)@S 1374.268757 1374.9239 H(73)C(42)O(44)P(3) 1374.268757 H(73)C(42)O(44)P(3) O-linked glycosylation 1690 0.5 +dHex(1)Hex(4)HexA(1)HexNAc(2)@S 1376.460036 1377.2128 H(84)C(52)N(2)O(40) 1376.460036 H(84)C(52)N(2)O(40) O-linked glycosylation 1691 0.5 +dHex(1)Hex(4)HexA(1)HexNAc(2)@T 1376.460036 1377.2128 H(84)C(52)N(2)O(40) 1376.460036 H(84)C(52)N(2)O(40) O-linked glycosylation 1691 0.5 +Hex(3)HexNAc(3)NeuAc(1)@T 1386.492005 1387.2539 H(86)C(53)N(4)O(38) 1386.492005 H(86)C(53)N(4)O(38) O-linked glycosylation 1692 0.5 +Hex(3)HexNAc(3)NeuAc(1)@S 1386.492005 1387.2539 H(86)C(53)N(4)O(38) 1386.492005 H(86)C(53)N(4)O(38) O-linked glycosylation 1692 0.5 +dHex(1)Hex(3)HexA(2)HexNAc(2)@S 1390.439301 1391.1963 H(82)C(52)N(2)O(41) 1390.439301 H(82)C(52)N(2)O(41) O-linked glycosylation 1693 0.5 +dHex(1)Hex(3)HexA(2)HexNAc(2)@T 1390.439301 1391.1963 H(82)C(52)N(2)O(41) 1390.439301 H(82)C(52)N(2)O(41) O-linked glycosylation 1693 0.5 +Hex(2)HexNAc(2)NeuAc(2)Sulf(1)@T 1392.41204 1393.2386 H(80)C(50)N(4)O(39)S(1) 1392.41204 H(80)C(50)N(4)O(39)S(1) O-linked glycosylation 1694 0.5 +Hex(2)HexNAc(2)NeuAc(2)Sulf(1)@S 1392.41204 1393.2386 H(80)C(50)N(4)O(39)S(1) 1392.41204 H(80)C(50)N(4)O(39)S(1) O-linked glycosylation 1694 0.5 +dHex(2)Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@T 1393.432441 1394.2664 H(83)C(51)N(3)O(39)S(1) 1393.432441 H(83)C(51)N(3)O(39)S(1) O-linked glycosylation 1695 0.5 +dHex(2)Hex(2)HexNAc(2)NeuAc(1)Sulf(1)@S 1393.432441 1394.2664 H(83)C(51)N(3)O(39)S(1) 1393.432441 H(83)C(51)N(3)O(39)S(1) O-linked glycosylation 1695 0.5 +Hex(3)HexNAc(3)NeuGc(1)@S 1402.48692 1403.2533 H(86)C(53)N(4)O(39) 1402.48692 H(86)C(53)N(4)O(39) O-linked glycosylation 1696 0.5 +Hex(3)HexNAc(3)NeuGc(1)@T 1402.48692 1403.2533 H(86)C(53)N(4)O(39) 1402.48692 
H(86)C(53)N(4)O(39) O-linked glycosylation 1696 0.5 +dHex(4)Hex(1)HexNAc(2)Kdn(1)@T 1402.512072 1403.2931 H(90)C(55)N(2)O(39) 1402.512072 H(90)C(55)N(2)O(39) O-linked glycosylation 1697 0.5 +dHex(4)Hex(1)HexNAc(2)Kdn(1)@S 1402.512072 1403.2931 H(90)C(55)N(2)O(39) 1402.512072 H(90)C(55)N(2)O(39) O-linked glycosylation 1697 0.5 +dHex(3)Hex(2)HexNAc(2)Kdn(1)@T 1418.506986 1419.2925 H(90)C(55)N(2)O(40) 1418.506986 H(90)C(55)N(2)O(40) O-linked glycosylation 1698 0.5 +dHex(3)Hex(2)HexNAc(2)Kdn(1)@S 1418.506986 1419.2925 H(90)C(55)N(2)O(40) 1418.506986 H(90)C(55)N(2)O(40) O-linked glycosylation 1698 0.5 +dHex(3)Hex(2)HexA(1)HexNAc(2)Sulf(1)@T 1424.427021 1425.2772 H(84)C(52)N(2)O(41)S(1) 1424.427021 H(84)C(52)N(2)O(41)S(1) O-linked glycosylation 1699 0.5 +dHex(3)Hex(2)HexA(1)HexNAc(2)Sulf(1)@S 1424.427021 1425.2772 H(84)C(52)N(2)O(41)S(1) 1424.427021 H(84)C(52)N(2)O(41)S(1) O-linked glycosylation 1699 0.5 +Hex(2)HexNAc(4)NeuAc(1)@S 1427.518554 1428.3059 H(89)C(55)N(5)O(38) 1427.518554 H(89)C(55)N(5)O(38) O-linked glycosylation 1700 0.5 +Hex(2)HexNAc(4)NeuAc(1)@T 1427.518554 1428.3059 H(89)C(55)N(5)O(38) 1427.518554 H(89)C(55)N(5)O(38) O-linked glycosylation 1700 0.5 +dHex(2)Hex(2)HexNAc(4)@S 1428.538955 1429.3337 H(92)C(56)N(4)O(38) 1428.538955 H(92)C(56)N(4)O(38) O-linked glycosylation 1701 0.5 +dHex(2)Hex(2)HexNAc(4)@T 1428.538955 1429.3337 H(92)C(56)N(4)O(38) 1428.538955 H(92)C(56)N(4)O(38) O-linked glycosylation 1701 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(2)Sulf(1)@T 1440.421936 1441.2766 H(84)C(52)N(2)O(42)S(1) 1440.421936 H(84)C(52)N(2)O(42)S(1) O-linked glycosylation 1702 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(2)Sulf(1)@S 1440.421936 1441.2766 H(84)C(52)N(2)O(42)S(1) 1440.421936 H(84)C(52)N(2)O(42)S(1) O-linked glycosylation 1702 0.5 +dHex(4)HexNAc(3)Kdn(1)@T 1443.538621 1444.345 H(93)C(57)N(3)O(39) 1443.538621 H(93)C(57)N(3)O(39) O-linked glycosylation 1703 0.5 +dHex(4)HexNAc(3)Kdn(1)@S 1443.538621 1444.345 H(93)C(57)N(3)O(39) 1443.538621 H(93)C(57)N(3)O(39) O-linked glycosylation 1703 0.5 +Hex(2)HexNAc(1)NeuGc(3)@S 1448.456013 1449.2357 H(84)C(53)N(4)O(42) 1448.456013 H(84)C(53)N(4)O(42) O-linked glycosylation 1705 0.5 +Hex(2)HexNAc(1)NeuGc(3)@T 1448.456013 1449.2357 H(84)C(53)N(4)O(42) 1448.456013 H(84)C(53)N(4)O(42) O-linked glycosylation 1705 0.5 +dHex(4)Hex(1)HexNAc(1)Kdn(2)@T 1449.501567 1450.3032 H(91)C(56)N(1)O(42) 1449.501567 H(91)C(56)N(1)O(42) O-linked glycosylation 1706 0.5 +dHex(4)Hex(1)HexNAc(1)Kdn(2)@S 1449.501567 1450.3032 H(91)C(56)N(1)O(42) 1449.501567 H(91)C(56)N(1)O(42) O-linked glycosylation 1706 0.5 +dHex(1)Hex(2)HexNAc(3)NeuAc(1)Sulf(1)@T 1450.453905 1451.3177 H(86)C(53)N(4)O(40)S(1) 1450.453905 H(86)C(53)N(4)O(40)S(1) O-linked glycosylation 1707 0.5 +dHex(1)Hex(2)HexNAc(3)NeuAc(1)Sulf(1)@S 1450.453905 1451.3177 H(86)C(53)N(4)O(40)S(1) 1450.453905 H(86)C(53)N(4)O(40)S(1) O-linked glycosylation 1707 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(2)@S 1458.513134 1459.3166 H(90)C(56)N(4)O(40) 1458.513134 H(90)C(56)N(4)O(40) O-linked glycosylation 1708 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(2)@T 1458.513134 1459.3166 H(90)C(56)N(4)O(40) 1458.513134 H(90)C(56)N(4)O(40) O-linked glycosylation 1708 0.5 +dHex(3)Hex(1)HexNAc(3)Kdn(1)@T 1459.533535 1460.3444 H(93)C(57)N(3)O(40) 1459.533535 H(93)C(57)N(3)O(40) O-linked glycosylation 1709 0.5 +dHex(3)Hex(1)HexNAc(3)Kdn(1)@S 1459.533535 1460.3444 H(93)C(57)N(3)O(40) 1459.533535 H(93)C(57)N(3)O(40) O-linked glycosylation 1709 0.5 +Hex(3)HexNAc(3)NeuAc(1)Sulf(1)@T 1466.44882 1467.3171 H(86)C(53)N(4)O(41)S(1) 1466.44882 H(86)C(53)N(4)O(41)S(1) O-linked 
glycosylation 1711 0.5 +Hex(3)HexNAc(3)NeuAc(1)Sulf(1)@S 1466.44882 1467.3171 H(86)C(53)N(4)O(41)S(1) 1466.44882 H(86)C(53)N(4)O(41)S(1) O-linked glycosylation 1711 0.5 +Hex(3)HexNAc(2)NeuAc(2)@S 1474.508049 1475.316 H(90)C(56)N(4)O(41) 1474.508049 H(90)C(56)N(4)O(41) O-linked glycosylation 1712 0.5 +Hex(3)HexNAc(2)NeuAc(2)@T 1474.508049 1475.316 H(90)C(56)N(4)O(41) 1474.508049 H(90)C(56)N(4)O(41) O-linked glycosylation 1712 0.5 +Hex(3)HexNAc(3)NeuGc(1)Sulf(1)@T 1482.443734 1483.3165 H(86)C(53)N(4)O(42)S(1) 1482.443734 H(86)C(53)N(4)O(42)S(1) O-linked glycosylation 1713 0.5 +Hex(3)HexNAc(3)NeuGc(1)Sulf(1)@S 1482.443734 1483.3165 H(86)C(53)N(4)O(42)S(1) 1482.443734 H(86)C(53)N(4)O(42)S(1) O-linked glycosylation 1713 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(2)@S 1490.502964 1491.3154 H(90)C(56)N(4)O(42) 1490.502964 H(90)C(56)N(4)O(42) O-linked glycosylation 1714 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(2)@T 1490.502964 1491.3154 H(90)C(56)N(4)O(42) 1490.502964 H(90)C(56)N(4)O(42) O-linked glycosylation 1714 0.5 +dHex(2)Hex(3)HexNAc(2)NeuGc(1)@T 1491.523365 1492.3432 H(93)C(57)N(3)O(42) 1491.523365 H(93)C(57)N(3)O(42) O-linked glycosylation 1715 0.5 +dHex(2)Hex(3)HexNAc(2)NeuGc(1)@S 1491.523365 1492.3432 H(93)C(57)N(3)O(42) 1491.523365 H(93)C(57)N(3)O(42) O-linked glycosylation 1715 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(3)Sulf(1)@T 1497.4434 1498.3279 H(87)C(54)N(3)O(43)S(1) 1497.4434 H(87)C(54)N(3)O(43)S(1) O-linked glycosylation 1716 0.5 +dHex(1)Hex(3)HexA(1)HexNAc(3)Sulf(1)@S 1497.4434 1498.3279 H(87)C(54)N(3)O(43)S(1) 1497.4434 H(87)C(54)N(3)O(43)S(1) O-linked glycosylation 1716 0.5 +Hex(2)HexNAc(3)NeuAc(2)@S 1515.534598 1516.3679 H(93)C(58)N(5)O(41) 1515.534598 H(93)C(58)N(5)O(41) O-linked glycosylation 1717 0.5 +Hex(2)HexNAc(3)NeuAc(2)@T 1515.534598 1516.3679 H(93)C(58)N(5)O(41) 1515.534598 H(93)C(58)N(5)O(41) O-linked glycosylation 1717 0.5 +dHex(2)Hex(2)HexNAc(3)NeuAc(1)@S 1516.554999 1517.3957 H(96)C(59)N(4)O(41) 1516.554999 H(96)C(59)N(4)O(41) O-linked glycosylation 1718 0.5 +dHex(2)Hex(2)HexNAc(3)NeuAc(1)@T 1516.554999 1517.3957 H(96)C(59)N(4)O(41) 1516.554999 H(96)C(59)N(4)O(41) O-linked glycosylation 1718 0.5 +dHex(4)Hex(2)HexNAc(3)@S 1517.5754 1518.4236 H(99)C(60)N(3)O(41) 1517.5754 H(99)C(60)N(3)O(41) O-linked glycosylation 1719 0.5 +dHex(4)Hex(2)HexNAc(3)@T 1517.5754 1518.4236 H(99)C(60)N(3)O(41) 1517.5754 H(99)C(60)N(3)O(41) O-linked glycosylation 1719 0.5 +Hex(2)HexNAc(3)NeuAc(1)NeuGc(1)@S 1531.529513 1532.3673 H(93)C(58)N(5)O(42) 1531.529513 H(93)C(58)N(5)O(42) O-linked glycosylation 1720 0.5 +Hex(2)HexNAc(3)NeuAc(1)NeuGc(1)@T 1531.529513 1532.3673 H(93)C(58)N(5)O(42) 1531.529513 H(93)C(58)N(5)O(42) O-linked glycosylation 1720 0.5 +dHex(2)Hex(2)HexNAc(3)NeuGc(1)@T 1532.549914 1533.3951 H(96)C(59)N(4)O(42) 1532.549914 H(96)C(59)N(4)O(42) O-linked glycosylation 1721 0.5 +dHex(2)Hex(2)HexNAc(3)NeuGc(1)@S 1532.549914 1533.3951 H(96)C(59)N(4)O(42) 1532.549914 H(96)C(59)N(4)O(42) O-linked glycosylation 1721 0.5 +dHex(3)Hex(3)HexNAc(3)@S 1533.570315 1534.423 H(99)C(60)N(3)O(42) 1533.570315 H(99)C(60)N(3)O(42) O-linked glycosylation 1722 0.5 +dHex(3)Hex(3)HexNAc(3)@T 1533.570315 1534.423 H(99)C(60)N(3)O(42) 1533.570315 H(99)C(60)N(3)O(42) O-linked glycosylation 1722 0.5 +Hex(8)Phos(3)@T 1536.321581 1537.0645 H(83)C(48)O(49)P(3) 1536.321581 H(83)C(48)O(49)P(3) O-linked glycosylation 1723 0.5 +Hex(8)Phos(3)@S 1536.321581 1537.0645 H(83)C(48)O(49)P(3) 1536.321581 H(83)C(48)O(49)P(3) O-linked glycosylation 1723 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(2)Sulf(1)@T 1538.469949 1539.3798 H(90)C(56)N(4)O(43)S(1) 
1538.469949 H(90)C(56)N(4)O(43)S(1) O-linked glycosylation 1724 0.5 +dHex(1)Hex(2)HexNAc(2)NeuAc(2)Sulf(1)@S 1538.469949 1539.3798 H(90)C(56)N(4)O(43)S(1) 1538.469949 H(90)C(56)N(4)O(43)S(1) O-linked glycosylation 1724 0.5 +Hex(2)HexNAc(3)NeuGc(2)@S 1547.524427 1548.3667 H(93)C(58)N(5)O(43) 1547.524427 H(93)C(58)N(5)O(43) O-linked glycosylation 1725 0.5 +Hex(2)HexNAc(3)NeuGc(2)@T 1547.524427 1548.3667 H(93)C(58)N(5)O(43) 1547.524427 H(93)C(58)N(5)O(43) O-linked glycosylation 1725 0.5 +dHex(4)Hex(2)HexNAc(2)Kdn(1)@T 1564.564895 1565.4337 H(100)C(61)N(2)O(44) 1564.564895 H(100)C(61)N(2)O(44) O-linked glycosylation 1726 0.5 +dHex(4)Hex(2)HexNAc(2)Kdn(1)@S 1564.564895 1565.4337 H(100)C(61)N(2)O(44) 1564.564895 H(100)C(61)N(2)O(44) O-linked glycosylation 1726 0.5 +dHex(1)Hex(2)HexNAc(4)NeuAc(1)@S 1573.576463 1574.4471 H(99)C(61)N(5)O(42) 1573.576463 H(99)C(61)N(5)O(42) O-linked glycosylation 1727 0.5 +dHex(1)Hex(2)HexNAc(4)NeuAc(1)@T 1573.576463 1574.4471 H(99)C(61)N(5)O(42) 1573.576463 H(99)C(61)N(5)O(42) O-linked glycosylation 1727 0.5 +dHex(3)Hex(2)HexNAc(4)@S 1574.596864 1575.4749 H(102)C(62)N(4)O(42) 1574.596864 H(102)C(62)N(4)O(42) O-linked glycosylation 1728 0.5 +dHex(3)Hex(2)HexNAc(4)@T 1574.596864 1575.4749 H(102)C(62)N(4)O(42) 1574.596864 H(102)C(62)N(4)O(42) O-linked glycosylation 1728 0.5 +Hex(1)HexNAc(1)NeuGc(4)@S 1593.493521 1594.349 H(91)C(58)N(5)O(46) 1593.493521 H(91)C(58)N(5)O(46) O-linked glycosylation 1729 0.5 +Hex(1)HexNAc(1)NeuGc(4)@T 1593.493521 1594.349 H(91)C(58)N(5)O(46) 1593.493521 H(91)C(58)N(5)O(46) O-linked glycosylation 1729 0.5 +dHex(4)Hex(1)HexNAc(3)Kdn(1)@T 1605.591444 1606.4856 H(103)C(63)N(3)O(44) 1605.591444 H(103)C(63)N(3)O(44) O-linked glycosylation 1730 0.5 +dHex(4)Hex(1)HexNAc(3)Kdn(1)@S 1605.591444 1606.4856 H(103)C(63)N(3)O(44) 1605.591444 H(103)C(63)N(3)O(44) O-linked glycosylation 1730 0.5 +Hex(4)HexNAc(4)Sulf(2)@T 1620.442414 1621.4589 H(92)C(56)N(4)O(46)S(2) 1620.442414 H(92)C(56)N(4)O(46)S(2) O-linked glycosylation 1732 0.5 +Hex(4)HexNAc(4)Sulf(2)@S 1620.442414 1621.4589 H(92)C(56)N(4)O(46)S(2) 1620.442414 H(92)C(56)N(4)O(46)S(2) O-linked glycosylation 1732 0.5 +dHex(3)Hex(2)HexNAc(3)Kdn(1)@T 1621.586359 1622.485 H(103)C(63)N(3)O(45) 1621.586359 H(103)C(63)N(3)O(45) O-linked glycosylation 1733 0.5 +dHex(3)Hex(2)HexNAc(3)Kdn(1)@S 1621.586359 1622.485 H(103)C(63)N(3)O(45) 1621.586359 H(103)C(63)N(3)O(45) O-linked glycosylation 1733 0.5 +dHex(2)Hex(2)HexNAc(5)@S 1631.618328 1632.5262 H(105)C(64)N(5)O(43) 1631.618328 H(105)C(64)N(5)O(43) O-linked glycosylation 1735 0.5 +dHex(2)Hex(2)HexNAc(5)@T 1631.618328 1632.5262 H(105)C(64)N(5)O(43) 1631.618328 H(105)C(64)N(5)O(43) O-linked glycosylation 1735 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(3)Sulf(1)@T 1643.501309 1644.4691 H(97)C(60)N(3)O(47)S(1) 1643.501309 H(97)C(60)N(3)O(47)S(1) O-linked glycosylation 1736 0.5 +dHex(2)Hex(3)HexA(1)HexNAc(3)Sulf(1)@S 1643.501309 1644.4691 H(97)C(60)N(3)O(47)S(1) 1643.501309 H(97)C(60)N(3)O(47)S(1) O-linked glycosylation 1736 0.5 +dHex(1)Hex(4)HexA(1)HexNAc(3)Sulf(1)@T 1659.496223 1660.4685 H(97)C(60)N(3)O(48)S(1) 1659.496223 H(97)C(60)N(3)O(48)S(1) O-linked glycosylation 1737 0.5 +dHex(1)Hex(4)HexA(1)HexNAc(3)Sulf(1)@S 1659.496223 1660.4685 H(97)C(60)N(3)O(48)S(1) 1659.496223 H(97)C(60)N(3)O(48)S(1) O-linked glycosylation 1737 0.5 +Hex(3)HexNAc(3)NeuAc(2)@S 1677.587422 1678.5085 H(103)C(64)N(5)O(46) 1677.587422 H(103)C(64)N(5)O(46) O-linked glycosylation 1738 0.5 +Hex(3)HexNAc(3)NeuAc(2)@T 1677.587422 1678.5085 H(103)C(64)N(5)O(46) 1677.587422 H(103)C(64)N(5)O(46) O-linked 
glycosylation 1738 0.5 +dHex(2)Hex(3)HexNAc(3)NeuAc(1)@T 1678.607823 1679.5363 H(106)C(65)N(4)O(46) 1678.607823 H(106)C(65)N(4)O(46) O-linked glycosylation 1739 0.5 +dHex(2)Hex(3)HexNAc(3)NeuAc(1)@S 1678.607823 1679.5363 H(106)C(65)N(4)O(46) 1678.607823 H(106)C(65)N(4)O(46) O-linked glycosylation 1739 0.5 +dHex(4)Hex(3)HexNAc(3)@S 1679.628224 1680.5642 H(109)C(66)N(3)O(46) 1679.628224 H(109)C(66)N(3)O(46) O-linked glycosylation 1740 0.5 +dHex(4)Hex(3)HexNAc(3)@T 1679.628224 1680.5642 H(109)C(66)N(3)O(46) 1679.628224 H(109)C(66)N(3)O(46) O-linked glycosylation 1740 0.5 +Hex(9)Phos(3)@T 1698.374404 1699.2051 H(93)C(54)O(54)P(3) 1698.374404 H(93)C(54)O(54)P(3) O-linked glycosylation 1742 0.5 +Hex(9)Phos(3)@S 1698.374404 1699.2051 H(93)C(54)O(54)P(3) 1698.374404 H(93)C(54)O(54)P(3) O-linked glycosylation 1742 0.5 +dHex(2)HexNAc(7)@S 1713.671426 1714.63 H(111)C(68)N(7)O(43) 1713.671426 H(111)C(68)N(7)O(43) O-linked glycosylation 1743 0.5 +dHex(2)HexNAc(7)@T 1713.671426 1714.63 H(111)C(68)N(7)O(43) 1713.671426 H(111)C(68)N(7)O(43) O-linked glycosylation 1743 0.5 +Hex(2)HexNAc(1)NeuGc(4)@S 1755.546345 1756.4896 H(101)C(64)N(5)O(51) 1755.546345 H(101)C(64)N(5)O(51) O-linked glycosylation 1744 0.5 +Hex(2)HexNAc(1)NeuGc(4)@T 1755.546345 1756.4896 H(101)C(64)N(5)O(51) 1755.546345 H(101)C(64)N(5)O(51) O-linked glycosylation 1744 0.5 +Hex(3)HexNAc(3)NeuAc(2)Sulf(1)@T 1757.544236 1758.5717 H(103)C(64)N(5)O(49)S(1) 1757.544236 H(103)C(64)N(5)O(49)S(1) O-linked glycosylation 1745 0.5 +Hex(3)HexNAc(3)NeuAc(2)Sulf(1)@S 1757.544236 1758.5717 H(103)C(64)N(5)O(49)S(1) 1757.544236 H(103)C(64)N(5)O(49)S(1) O-linked glycosylation 1745 0.5 +dHex(2)Hex(3)HexNAc(5)@T 1793.671151 1794.6668 H(115)C(70)N(5)O(48) 1793.671151 H(115)C(70)N(5)O(48) O-linked glycosylation 1746 0.5 +dHex(2)Hex(3)HexNAc(5)@S 1793.671151 1794.6668 H(115)C(70)N(5)O(48) 1793.671151 H(115)C(70)N(5)O(48) O-linked glycosylation 1746 0.5 +dHex(2)Hex(3)HexNAc(5)@N 1793.671151 1794.6668 H(115)C(70)N(5)O(48) 1793.671151 H(115)C(70)N(5)O(48) N-linked glycosylation 1746 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(3)@S 1797.593295 1798.5694 H(107)C(67)N(5)O(51) 1797.593295 H(107)C(67)N(5)O(51) O-linked glycosylation 1747 0.5 +dHex(1)Hex(2)HexNAc(2)NeuGc(3)@T 1797.593295 1798.5694 H(107)C(67)N(5)O(51) 1797.593295 H(107)C(67)N(5)O(51) O-linked glycosylation 1747 0.5 +dHex(2)Hex(4)HexA(1)HexNAc(3)Sulf(1)@T 1805.554132 1806.6097 H(107)C(66)N(3)O(52)S(1) 1805.554132 H(107)C(66)N(3)O(52)S(1) O-linked glycosylation 1748 0.5 +dHex(2)Hex(4)HexA(1)HexNAc(3)Sulf(1)@S 1805.554132 1806.6097 H(107)C(66)N(3)O(52)S(1) 1805.554132 H(107)C(66)N(3)O(52)S(1) O-linked glycosylation 1748 0.5 +Hex(2)HexNAc(3)NeuAc(3)@S 1806.630015 1807.6225 H(110)C(69)N(6)O(49) 1806.630015 H(110)C(69)N(6)O(49) O-linked glycosylation 1749 0.5 +Hex(2)HexNAc(3)NeuAc(3)@T 1806.630015 1807.6225 H(110)C(69)N(6)O(49) 1806.630015 H(110)C(69)N(6)O(49) O-linked glycosylation 1749 0.5 +dHex(1)Hex(3)HexNAc(3)NeuAc(2)@S 1823.64533 1824.6497 H(113)C(70)N(5)O(50) 1823.64533 H(113)C(70)N(5)O(50) O-linked glycosylation 1750 0.5 +dHex(1)Hex(3)HexNAc(3)NeuAc(2)@T 1823.64533 1824.6497 H(113)C(70)N(5)O(50) 1823.64533 H(113)C(70)N(5)O(50) O-linked glycosylation 1750 0.5 +dHex(3)Hex(3)HexNAc(3)NeuAc(1)@S 1824.665732 1825.6775 H(116)C(71)N(4)O(50) 1824.665732 H(116)C(71)N(4)O(50) O-linked glycosylation 1751 0.5 +dHex(3)Hex(3)HexNAc(3)NeuAc(1)@T 1824.665732 1825.6775 H(116)C(71)N(4)O(50) 1824.665732 H(116)C(71)N(4)O(50) O-linked glycosylation 1751 0.5 +Hex(2)HexNAc(3)NeuGc(3)@S 1854.614759 1855.6207 H(110)C(69)N(6)O(52) 
1854.614759 H(110)C(69)N(6)O(52) O-linked glycosylation 1752 0.5 +Hex(2)HexNAc(3)NeuGc(3)@T 1854.614759 1855.6207 H(110)C(69)N(6)O(52) 1854.614759 H(110)C(69)N(6)O(52) O-linked glycosylation 1752 0.5 +Hex(10)Phos(3)@T 1860.427228 1861.3457 H(103)C(60)O(59)P(3) 1860.427228 H(103)C(60)O(59)P(3) O-linked glycosylation 1753 0.5 +Hex(10)Phos(3)@S 1860.427228 1861.3457 H(103)C(60)O(59)P(3) 1860.427228 H(103)C(60)O(59)P(3) O-linked glycosylation 1753 0.5 +dHex(1)Hex(2)HexNAc(4)NeuAc(2)@S 1864.67188 1865.7016 H(116)C(72)N(6)O(50) 1864.67188 H(116)C(72)N(6)O(50) O-linked glycosylation 1754 0.5 +dHex(1)Hex(2)HexNAc(4)NeuAc(2)@T 1864.67188 1865.7016 H(116)C(72)N(6)O(50) 1864.67188 H(116)C(72)N(6)O(50) O-linked glycosylation 1754 0.5 +Hex(1)HexNAc(1)NeuGc(5)@S 1900.583852 1901.603 H(108)C(69)N(6)O(55) 1900.583852 H(108)C(69)N(6)O(55) O-linked glycosylation 1755 0.5 +Hex(1)HexNAc(1)NeuGc(5)@T 1900.583852 1901.603 H(108)C(69)N(6)O(55) 1900.583852 H(108)C(69)N(6)O(55) O-linked glycosylation 1755 0.5 +Hex(4)HexNAc(4)NeuAc(1)Sulf(2)@T 1911.53783 1912.7135 H(109)C(67)N(5)O(54)S(2) 1911.53783 H(109)C(67)N(5)O(54)S(2) O-linked glycosylation 1756 0.5 +Hex(4)HexNAc(4)NeuAc(1)Sulf(2)@S 1911.53783 1912.7135 H(109)C(67)N(5)O(54)S(2) 1911.53783 H(109)C(67)N(5)O(54)S(2) O-linked glycosylation 1756 0.5 +Hex(4)HexNAc(4)NeuGc(1)Sulf(2)@T 1927.532745 1928.7129 H(109)C(67)N(5)O(55)S(2) 1927.532745 H(109)C(67)N(5)O(55)S(2) O-linked glycosylation 1757 0.5 +Hex(4)HexNAc(4)NeuGc(1)Sulf(2)@S 1927.532745 1928.7129 H(109)C(67)N(5)O(55)S(2) 1927.532745 H(109)C(67)N(5)O(55)S(2) O-linked glycosylation 1757 0.5 +dHex(2)Hex(3)HexNAc(3)NeuAc(2)@S 1969.703239 1970.7909 H(123)C(76)N(5)O(54) 1969.703239 H(123)C(76)N(5)O(54) O-linked glycosylation 1758 0.5 +dHex(2)Hex(3)HexNAc(3)NeuAc(2)@T 1969.703239 1970.7909 H(123)C(76)N(5)O(54) 1969.703239 H(123)C(76)N(5)O(54) O-linked glycosylation 1758 0.5 +Hex(4)HexNAc(4)NeuAc(1)Sulf(3)@T 1991.494645 1992.7767 H(109)C(67)N(5)O(57)S(3) 1991.494645 H(109)C(67)N(5)O(57)S(3) O-linked glycosylation 1759 0.5 +Hex(4)HexNAc(4)NeuAc(1)Sulf(3)@S 1991.494645 1992.7767 H(109)C(67)N(5)O(57)S(3) 1991.494645 H(109)C(67)N(5)O(57)S(3) O-linked glycosylation 1759 0.5 +dHex(2)Hex(2)HexNAc(2)@S 1022.38021 1022.9486 H(66)C(40)N(2)O(28) 1022.38021 H(66)C(40)N(2)O(28) O-linked glycosylation 1760 0.5 +dHex(2)Hex(2)HexNAc(2)@T 1022.38021 1022.9486 H(66)C(40)N(2)O(28) 1022.38021 H(66)C(40)N(2)O(28) O-linked glycosylation 1760 0.5 +dHex(2)Hex(2)HexNAc(2)@N 1022.38021 1022.9486 H(66)C(40)N(2)O(28) 1022.38021 H(66)C(40)N(2)O(28) N-linked glycosylation 1760 0.5 +dHex(1)Hex(3)HexNAc(2)@S 1038.375125 1038.948 H(66)C(40)N(2)O(29) 1038.375125 H(66)C(40)N(2)O(29) O-linked glycosylation 1761 0.5 +dHex(1)Hex(3)HexNAc(2)@T 1038.375125 1038.948 H(66)C(40)N(2)O(29) 1038.375125 H(66)C(40)N(2)O(29) O-linked glycosylation 1761 0.5 +dHex(1)Hex(3)HexNAc(2)@N 1038.375125 1038.948 H(66)C(40)N(2)O(29) 1038.375125 H(66)C(40)N(2)O(29) N-linked glycosylation 1761 0.5 +dHex(1)Hex(2)HexNAc(3)@S 1079.401674 1080.0 H(69)C(42)N(3)O(29) 1079.401674 H(69)C(42)N(3)O(29) O-linked glycosylation 1762 0.5 +dHex(1)Hex(2)HexNAc(3)@T 1079.401674 1080.0 H(69)C(42)N(3)O(29) 1079.401674 H(69)C(42)N(3)O(29) O-linked glycosylation 1762 0.5 +dHex(1)Hex(2)HexNAc(3)@N 1079.401674 1080.0 H(69)C(42)N(3)O(29) 1079.401674 H(69)C(42)N(3)O(29) N-linked glycosylation 1762 0.5 +Hex(3)HexNAc(3)@S 1095.396588 1095.9994 H(69)C(42)N(3)O(30) 1095.396588 H(69)C(42)N(3)O(30) O-linked glycosylation 1763 0.5 +Hex(3)HexNAc(3)@T 1095.396588 1095.9994 H(69)C(42)N(3)O(30) 1095.396588 
H(69)C(42)N(3)O(30) O-linked glycosylation 1763 0.5 +Hex(3)HexNAc(3)@N 1095.396588 1095.9994 H(69)C(42)N(3)O(30) 1095.396588 H(69)C(42)N(3)O(30) N-linked glycosylation 1763 0.5 +dHex(1)Hex(3)HexNAc(2)Sulf(1)@N 1118.331939 1119.0112 H(66)C(40)N(2)O(32)S(1) 1118.331939 H(66)C(40)N(2)O(32)S(1) N-linked glycosylation 1764 0.5 +dHex(1)Hex(3)HexNAc(2)Sulf(1)@T 1118.331939 1119.0112 H(66)C(40)N(2)O(32)S(1) 1118.331939 H(66)C(40)N(2)O(32)S(1) O-linked glycosylation 1764 0.5 +dHex(1)Hex(3)HexNAc(2)Sulf(1)@S 1118.331939 1119.0112 H(66)C(40)N(2)O(32)S(1) 1118.331939 H(66)C(40)N(2)O(32)S(1) O-linked glycosylation 1764 0.5 +dHex(2)Hex(3)HexNAc(2)@S 1184.433033 1185.0892 H(76)C(46)N(2)O(33) 1184.433033 H(76)C(46)N(2)O(33) O-linked glycosylation 1765 0.5 +dHex(2)Hex(3)HexNAc(2)@T 1184.433033 1185.0892 H(76)C(46)N(2)O(33) 1184.433033 H(76)C(46)N(2)O(33) O-linked glycosylation 1765 0.5 +dHex(2)Hex(3)HexNAc(2)@N 1184.433033 1185.0892 H(76)C(46)N(2)O(33) 1184.433033 H(76)C(46)N(2)O(33) N-linked glycosylation 1765 0.5 +dHex(1)Hex(4)HexNAc(2)@S 1200.427948 1201.0886 H(76)C(46)N(2)O(34) 1200.427948 H(76)C(46)N(2)O(34) O-linked glycosylation 1766 0.5 +dHex(1)Hex(4)HexNAc(2)@T 1200.427948 1201.0886 H(76)C(46)N(2)O(34) 1200.427948 H(76)C(46)N(2)O(34) O-linked glycosylation 1766 0.5 +dHex(1)Hex(4)HexNAc(2)@N 1200.427948 1201.0886 H(76)C(46)N(2)O(34) 1200.427948 H(76)C(46)N(2)O(34) N-linked glycosylation 1766 0.5 +dHex(2)Hex(2)HexNAc(3)@S 1225.459583 1226.1412 H(79)C(48)N(3)O(33) 1225.459583 H(79)C(48)N(3)O(33) O-linked glycosylation 1767 0.5 +dHex(2)Hex(2)HexNAc(3)@T 1225.459583 1226.1412 H(79)C(48)N(3)O(33) 1225.459583 H(79)C(48)N(3)O(33) O-linked glycosylation 1767 0.5 +dHex(2)Hex(2)HexNAc(3)@N 1225.459583 1226.1412 H(79)C(48)N(3)O(33) 1225.459583 H(79)C(48)N(3)O(33) N-linked glycosylation 1767 0.5 +dHex(1)Hex(3)HexNAc(3)@S 1241.454497 1242.1406 H(79)C(48)N(3)O(34) 1241.454497 H(79)C(48)N(3)O(34) O-linked glycosylation 1768 0.5 +dHex(1)Hex(3)HexNAc(3)@T 1241.454497 1242.1406 H(79)C(48)N(3)O(34) 1241.454497 H(79)C(48)N(3)O(34) O-linked glycosylation 1768 0.5 +dHex(1)Hex(3)HexNAc(3)@N 1241.454497 1242.1406 H(79)C(48)N(3)O(34) 1241.454497 H(79)C(48)N(3)O(34) N-linked glycosylation 1768 0.5 +Hex(4)HexNAc(3)@S 1257.449412 1258.14 H(79)C(48)N(3)O(35) 1257.449412 H(79)C(48)N(3)O(35) O-linked glycosylation 1769 0.5 +Hex(4)HexNAc(3)@T 1257.449412 1258.14 H(79)C(48)N(3)O(35) 1257.449412 H(79)C(48)N(3)O(35) O-linked glycosylation 1769 0.5 +Hex(4)HexNAc(3)@N 1257.449412 1258.14 H(79)C(48)N(3)O(35) 1257.449412 H(79)C(48)N(3)O(35) N-linked glycosylation 1769 0.5 +dHex(2)Hex(4)HexNAc(2)@S 1346.485857 1347.2298 H(86)C(52)N(2)O(38) 1346.485857 H(86)C(52)N(2)O(38) O-linked glycosylation 1770 0.5 +dHex(2)Hex(4)HexNAc(2)@T 1346.485857 1347.2298 H(86)C(52)N(2)O(38) 1346.485857 H(86)C(52)N(2)O(38) O-linked glycosylation 1770 0.5 +dHex(2)Hex(4)HexNAc(2)@N 1346.485857 1347.2298 H(86)C(52)N(2)O(38) 1346.485857 H(86)C(52)N(2)O(38) N-linked glycosylation 1770 0.5 +dHex(2)Hex(3)HexNAc(3)@S 1387.512406 1388.2818 H(89)C(54)N(3)O(38) 1387.512406 H(89)C(54)N(3)O(38) O-linked glycosylation 1771 0.5 +dHex(2)Hex(3)HexNAc(3)@T 1387.512406 1388.2818 H(89)C(54)N(3)O(38) 1387.512406 H(89)C(54)N(3)O(38) O-linked glycosylation 1771 0.5 +dHex(2)Hex(3)HexNAc(3)@N 1387.512406 1388.2818 H(89)C(54)N(3)O(38) 1387.512406 H(89)C(54)N(3)O(38) N-linked glycosylation 1771 0.5 +Hex(3)HexNAc(5)@N 1501.555334 1502.3844 H(95)C(58)N(5)O(40) 1501.555334 H(95)C(58)N(5)O(40) N-linked glycosylation 1772 0.5 +Hex(3)HexNAc(5)@T 1501.555334 1502.3844 H(95)C(58)N(5)O(40) 
1501.555334 H(95)C(58)N(5)O(40) O-linked glycosylation 1772 0.5 +Hex(3)HexNAc(5)@S 1501.555334 1502.3844 H(95)C(58)N(5)O(40) 1501.555334 H(95)C(58)N(5)O(40) O-linked glycosylation 1772 0.5 +Hex(4)HexNAc(3)NeuAc(1)@N 1548.544828 1549.3945 H(96)C(59)N(4)O(43) 1548.544828 H(96)C(59)N(4)O(43) N-linked glycosylation 1773 0.5 +Hex(4)HexNAc(3)NeuAc(1)@T 1548.544828 1549.3945 H(96)C(59)N(4)O(43) 1548.544828 H(96)C(59)N(4)O(43) O-linked glycosylation 1773 0.5 +Hex(4)HexNAc(3)NeuAc(1)@S 1548.544828 1549.3945 H(96)C(59)N(4)O(43) 1548.544828 H(96)C(59)N(4)O(43) O-linked glycosylation 1773 0.5 +dHex(2)Hex(3)HexNAc(4)@S 1590.591779 1591.4743 H(102)C(62)N(4)O(43) 1590.591779 H(102)C(62)N(4)O(43) O-linked glycosylation 1774 0.5 +dHex(2)Hex(3)HexNAc(4)@T 1590.591779 1591.4743 H(102)C(62)N(4)O(43) 1590.591779 H(102)C(62)N(4)O(43) O-linked glycosylation 1774 0.5 +dHex(2)Hex(3)HexNAc(4)@N 1590.591779 1591.4743 H(102)C(62)N(4)O(43) 1590.591779 H(102)C(62)N(4)O(43) N-linked glycosylation 1774 0.5 +dHex(1)Hex(3)HexNAc(5)@S 1647.613242 1648.5256 H(105)C(64)N(5)O(44) 1647.613242 H(105)C(64)N(5)O(44) O-linked glycosylation 1775 0.5 +dHex(1)Hex(3)HexNAc(5)@T 1647.613242 1648.5256 H(105)C(64)N(5)O(44) 1647.613242 H(105)C(64)N(5)O(44) O-linked glycosylation 1775 0.5 +dHex(1)Hex(3)HexNAc(5)@N 1647.613242 1648.5256 H(105)C(64)N(5)O(44) 1647.613242 H(105)C(64)N(5)O(44) N-linked glycosylation 1775 0.5 +Hex(3)HexNAc(6)@N 1704.634706 1705.5769 H(108)C(66)N(6)O(45) 1704.634706 H(108)C(66)N(6)O(45) N-linked glycosylation 1776 0.5 +Hex(3)HexNAc(6)@T 1704.634706 1705.5769 H(108)C(66)N(6)O(45) 1704.634706 H(108)C(66)N(6)O(45) O-linked glycosylation 1776 0.5 +Hex(3)HexNAc(6)@S 1704.634706 1705.5769 H(108)C(66)N(6)O(45) 1704.634706 H(108)C(66)N(6)O(45) O-linked glycosylation 1776 0.5 +Hex(4)HexNAc(4)NeuAc(1)@S 1751.624201 1752.5871 H(109)C(67)N(5)O(48) 1751.624201 H(109)C(67)N(5)O(48) O-linked glycosylation 1777 0.5 +Hex(4)HexNAc(4)NeuAc(1)@T 1751.624201 1752.5871 H(109)C(67)N(5)O(48) 1751.624201 H(109)C(67)N(5)O(48) O-linked glycosylation 1777 0.5 +Hex(4)HexNAc(4)NeuAc(1)@N 1751.624201 1752.5871 H(109)C(67)N(5)O(48) 1751.624201 H(109)C(67)N(5)O(48) N-linked glycosylation 1777 0.5 +dHex(2)Hex(4)HexNAc(4)@N 1752.644602 1753.6149 H(112)C(68)N(4)O(48) 1752.644602 H(112)C(68)N(4)O(48) N-linked glycosylation 1778 0.5 +dHex(2)Hex(4)HexNAc(4)@T 1752.644602 1753.6149 H(112)C(68)N(4)O(48) 1752.644602 H(112)C(68)N(4)O(48) O-linked glycosylation 1778 0.5 +dHex(2)Hex(4)HexNAc(4)@S 1752.644602 1753.6149 H(112)C(68)N(4)O(48) 1752.644602 H(112)C(68)N(4)O(48) O-linked glycosylation 1778 0.5 +Hex(6)HexNAc(4)@S 1784.634431 1785.6137 H(112)C(68)N(4)O(50) 1784.634431 H(112)C(68)N(4)O(50) O-linked glycosylation 1779 0.5 +Hex(6)HexNAc(4)@T 1784.634431 1785.6137 H(112)C(68)N(4)O(50) 1784.634431 H(112)C(68)N(4)O(50) O-linked glycosylation 1779 0.5 +Hex(6)HexNAc(4)@N 1784.634431 1785.6137 H(112)C(68)N(4)O(50) 1784.634431 H(112)C(68)N(4)O(50) N-linked glycosylation 1779 0.5 +Hex(5)HexNAc(5)@S 1825.660981 1826.6656 H(115)C(70)N(5)O(50) 1825.660981 H(115)C(70)N(5)O(50) O-linked glycosylation 1780 0.5 +Hex(5)HexNAc(5)@T 1825.660981 1826.6656 H(115)C(70)N(5)O(50) 1825.660981 H(115)C(70)N(5)O(50) O-linked glycosylation 1780 0.5 +Hex(5)HexNAc(5)@N 1825.660981 1826.6656 H(115)C(70)N(5)O(50) 1825.660981 H(115)C(70)N(5)O(50) N-linked glycosylation 1780 0.5 +dHex(1)Hex(3)HexNAc(6)@S 1850.692615 1851.7181 H(118)C(72)N(6)O(49) 1850.692615 H(118)C(72)N(6)O(49) O-linked glycosylation 1781 0.5 +dHex(1)Hex(3)HexNAc(6)@T 1850.692615 1851.7181 H(118)C(72)N(6)O(49) 
1850.692615 H(118)C(72)N(6)O(49) O-linked glycosylation 1781 0.5 +dHex(1)Hex(3)HexNAc(6)@N 1850.692615 1851.7181 H(118)C(72)N(6)O(49) 1850.692615 H(118)C(72)N(6)O(49) N-linked glycosylation 1781 0.5 +dHex(1)Hex(4)HexNAc(4)NeuAc(1)@N 1897.68211 1898.7283 H(119)C(73)N(5)O(52) 1897.68211 H(119)C(73)N(5)O(52) N-linked glycosylation 1782 0.5 +dHex(1)Hex(4)HexNAc(4)NeuAc(1)@T 1897.68211 1898.7283 H(119)C(73)N(5)O(52) 1897.68211 H(119)C(73)N(5)O(52) O-linked glycosylation 1782 0.5 +dHex(1)Hex(4)HexNAc(4)NeuAc(1)@S 1897.68211 1898.7283 H(119)C(73)N(5)O(52) 1897.68211 H(119)C(73)N(5)O(52) O-linked glycosylation 1782 0.5 +dHex(3)Hex(4)HexNAc(4)@S 1898.702511 1899.7561 H(122)C(74)N(4)O(52) 1898.702511 H(122)C(74)N(4)O(52) O-linked glycosylation 1783 0.5 +dHex(3)Hex(4)HexNAc(4)@T 1898.702511 1899.7561 H(122)C(74)N(4)O(52) 1898.702511 H(122)C(74)N(4)O(52) O-linked glycosylation 1783 0.5 +dHex(3)Hex(4)HexNAc(4)@N 1898.702511 1899.7561 H(122)C(74)N(4)O(52) 1898.702511 H(122)C(74)N(4)O(52) N-linked glycosylation 1783 0.5 +dHex(1)Hex(3)HexNAc(5)NeuAc(1)@S 1938.708659 1939.7802 H(122)C(75)N(6)O(52) 1938.708659 H(122)C(75)N(6)O(52) O-linked glycosylation 1784 0.5 +dHex(1)Hex(3)HexNAc(5)NeuAc(1)@T 1938.708659 1939.7802 H(122)C(75)N(6)O(52) 1938.708659 H(122)C(75)N(6)O(52) O-linked glycosylation 1784 0.5 +dHex(1)Hex(3)HexNAc(5)NeuAc(1)@N 1938.708659 1939.7802 H(122)C(75)N(6)O(52) 1938.708659 H(122)C(75)N(6)O(52) N-linked glycosylation 1784 0.5 +dHex(2)Hex(4)HexNAc(5)@S 1955.723975 1956.8074 H(125)C(76)N(5)O(53) 1955.723975 H(125)C(76)N(5)O(53) O-linked glycosylation 1785 0.5 +dHex(2)Hex(4)HexNAc(5)@T 1955.723975 1956.8074 H(125)C(76)N(5)O(53) 1955.723975 H(125)C(76)N(5)O(53) O-linked glycosylation 1785 0.5 +dHex(2)Hex(4)HexNAc(5)@N 1955.723975 1956.8074 H(125)C(76)N(5)O(53) 1955.723975 H(125)C(76)N(5)O(53) N-linked glycosylation 1785 0.5 +NQIGG@K 469.228496 469.4921 H(31)C(19)N(7)O(7) 0.0 Post-translational 1799 0.0 +Carboxyethylpyrrole@K 122.036779 122.1213 H(6)C(7)O(2) 0.0 Other 1800 0.0 +Fluorescein-tyramine@Y 493.116152 493.4637 H(19)C(29)N(1)O(7) 0.0 Chemical derivative 1801 0.0 +dHex(1)Hex(7)HexNAc(4)@N 2092.745164 2093.8955 H(132)C(80)N(4)O(59) 0.0 N-linked glycosylation 1840 0.0 +betaFNA@C 454.210387 454.5155 H(30)C(25)N(2)O(6) 0.0 Chemical derivative 1839 0.0 +betaFNA@K 454.210387 454.5155 H(30)C(25)N(2)O(6) 0.0 Chemical derivative 1839 0.0 +Brij58@Any_N-term 224.250401 224.4253 H(32)C(16) 0.0 Other 1838 0.0 +Brij35@Any_N-term 168.187801 168.319 H(24)C(12) 0.0 Other 1837 0.0 +Triton@Any_N-term 188.156501 188.3086 H(20)C(14) 0.0 Other 1836 0.0 +Triton@Any_C-term 188.156501 188.3086 H(20)C(14) 0.0 Other 1836 0.0 +Tween80@Any_C-term 263.237491 263.4381 H(31)C(18)O(1) 0.0 Other 1835 0.0 +Tween20@Any_N-term 165.164326 165.2951 H(21)C(12) 0.0 Other 1834 0.0 +Tris@N 104.071154 104.1277 H(10)C(4)N(1)O(2) 0.0 Artefact 1831 0.0 +Biotin-tyramide@Y 361.146012 361.4585 H(23)C(18)N(3)O(3)S(1) 0.0 Chemical derivative 1830 0.0 +Biotin-tyramide@W 361.146012 361.4585 H(23)C(18)N(3)O(3)S(1) 0.0 Chemical derivative 1830 0.0 +Biotin-tyramide@C 361.146012 361.4585 H(23)C(18)N(3)O(3)S(1) 0.0 Chemical derivative 1830 0.0 +LRGG+dimethyl@K 411.259403 411.4991 H(33)C(18)N(7)O(4) 0.0 Post-translational 1829 0.0 +RNPXL@R^Any_N-term 324.035867 324.1813 H(13)C(9)N(2)O(9)P(1) 324.035867 H(13)C(9)N(2)O(9)P(1) Other 1825 0.5 +RNPXL@K^Any_N-term 324.035867 324.1813 H(13)C(9)N(2)O(9)P(1) 324.035867 H(13)C(9)N(2)O(9)P(1) Other 1825 0.5 +GEE@Q 86.036779 86.0892 H(6)C(4)O(2) 0.0 Chemical derivative 1824 0.0 
+Glu->pyro-Glu+Methyl@E^Any_N-term -3.994915 -3.9887 C(1)O(-1) 0.0 Artefact 1826 0.0 +Glu->pyro-Glu+Methyl:2H(2)13C(1)@E^Any_N-term -0.979006 -0.9837 H(-2)2H(2)13C(1)O(-1) 0.0 Artefact 1827 0.0 +LRGG+methyl@K 397.243753 397.4725 H(31)C(17)N(7)O(4) 0.0 Post-translational 1828 0.0 +NP40@Any_N-term 220.182715 220.3505 H(24)C(15)O(1) 0.0 Other 1833 0.0 +IASD@C 452.034807 452.4582 H(16)C(18)N(2)O(8)S(2) 0.0 Chemical derivative 1832 0.0 +Biotin:Thermo-21328@K 389.090154 389.5564 H(23)C(15)N(3)O(3)S(3) 0.0 Chemical derivative 1841 0.0 +Biotin:Thermo-21328@Any_N-term 389.090154 389.5564 H(23)C(15)N(3)O(3)S(3) 0.0 Chemical derivative 1841 0.0 +PhosphoCytidine@Y 305.041287 305.1812 H(12)C(9)N(3)O(7)P(1) 0.0 Post-translational 1843 0.0 +PhosphoCytidine@T 305.041287 305.1812 H(12)C(9)N(3)O(7)P(1) 0.0 Post-translational 1843 0.0 +PhosphoCytidine@S 305.041287 305.1812 H(12)C(9)N(3)O(7)P(1) 0.0 Post-translational 1843 0.0 +AzidoF@F 41.001397 41.0122 H(-1)N(3) 0.0 Chemical derivative 1845 0.0 +Dimethylaminoethyl@C 71.073499 71.121 H(9)C(4)N(1) 0.0 Chemical derivative 1846 0.0 +Gluratylation@K 114.031694 114.0993 H(6)C(5)O(3) 0.0 Post-translational 1848 0.0 +hydroxyisobutyryl@K 86.036779 86.0892 H(6)C(4)O(2) 0.0 Post-translational 1849 CC(C)(O)C(=O)NCCCCC(N([Xe])[Xe])C([Rn])=O 0.0 +MeMePhosphorothioate@S 107.979873 108.0993 H(5)C(2)O(1)P(1)S(1) 0.0 Chemical derivative 1868 0.0 +Cation:Fe[III]@D 52.911464 52.8212 H(-3)Fe(1) 0.0 Artefact 1870 0.0 +Cation:Fe[III]@E 52.911464 52.8212 H(-3)Fe(1) 0.0 Artefact 1870 0.0 +Cation:Fe[III]@Any_C-term 52.911464 52.8212 H(-3)Fe(1) 0.0 Artefact 1870 0.0 +DTT@C 151.996571 152.2351 H(8)C(4)O(2)S(2) 0.0 Artefact 1871 0.0 +DYn-2@C 161.09664 161.2203 H(13)C(11)O(1) 0.0 Other 1872 0.0 +Xlink:DSSO[176]@K 176.01433 176.1903 H(8)C(6)O(4)S(1) 0.0 Chemical derivative 1878 0.0 +Xlink:DSSO[176]@Protein_N-term 176.01433 176.1903 H(8)C(6)O(4)S(1) 0.0 Chemical derivative 1878 0.0 +MesitylOxide@K 98.073165 98.143 H(10)C(6)O(1) 0.0 Chemical derivative 1873 0.0 +MesitylOxide@H 98.073165 98.143 H(10)C(6)O(1) 0.0 Chemical derivative 1873 0.0 +MesitylOxide@Protein_N-term 98.073165 98.143 H(10)C(6)O(1) 0.0 Chemical derivative 1873 0.0 +Xlink:DSS[259]@K 259.141973 259.2988 H(21)C(12)N(1)O(5) 0.0 Chemical derivative 1877 0.0 +Xlink:DSS[259]@Protein_N-term 259.141973 259.2988 H(21)C(12)N(1)O(5) 0.0 Chemical derivative 1877 0.0 +methylol@Y 30.010565 30.026 H(2)C(1)O(1) 0.0 Chemical derivative 1875 0.0 +methylol@W 30.010565 30.026 H(2)C(1)O(1) 0.0 Chemical derivative 1875 0.0 +methylol@K 30.010565 30.026 H(2)C(1)O(1) 0.0 Chemical derivative 1875 0.0 +Xlink:DSSO[175]@K 175.030314 175.2056 H(9)C(6)N(1)O(3)S(1) 0.0 Chemical derivative 1879 0.0 +Xlink:DSSO[175]@Protein_N-term 175.030314 175.2056 H(9)C(6)N(1)O(3)S(1) 0.0 Chemical derivative 1879 0.0 +Xlink:DSSO[279]@K 279.077658 279.3101 H(17)C(10)N(1)O(6)S(1) 0.0 Chemical derivative 1880 0.0 +Xlink:DSSO[279]@Protein_N-term 279.077658 279.3101 H(17)C(10)N(1)O(6)S(1) 0.0 Chemical derivative 1880 0.0 +Xlink:DSSO[54]@Protein_N-term 54.010565 54.0474 H(2)C(3)O(1) 0.0 Chemical derivative 1881 0.0 +Xlink:DSSO[54]@K 54.010565 54.0474 H(2)C(3)O(1) 0.0 Chemical derivative 1881 0.0 +Xlink:DSSO[86]@K 85.982635 86.1124 H(2)C(3)O(1)S(1) 0.0 Chemical derivative 1882 0.0 +Xlink:DSSO[86]@Protein_N-term 85.982635 86.1124 H(2)C(3)O(1)S(1) 0.0 Chemical derivative 1882 0.0 +Xlink:DSSO[104]@K 103.9932 104.1277 H(4)C(3)O(2)S(1) 0.0 Chemical derivative 1883 0.0 +Xlink:DSSO[104]@Protein_N-term 103.9932 104.1277 H(4)C(3)O(2)S(1) 0.0 Chemical derivative 1883 0.0 
+Xlink:BuUrBu[111]@S 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 1885 0.0 +Xlink:BuUrBu[111]@Protein_N-term 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 1885 0.0 +Xlink:BuUrBu[111]@K 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 1885 0.0 +Xlink:BuUrBu[111]@T 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 1885 0.0 +Xlink:BuUrBu[111]@Y 111.032028 111.0987 H(5)C(5)N(1)O(2) 0.0 Chemical derivative 1885 0.0 +Xlink:BuUrBu[85]@S 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Chemical derivative 1886 0.0 +Xlink:BuUrBu[85]@Protein_N-term 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Chemical derivative 1886 0.0 +Xlink:BuUrBu[85]@K 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Chemical derivative 1886 0.0 +Xlink:BuUrBu[85]@T 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Chemical derivative 1886 0.0 +Xlink:BuUrBu[85]@Y 85.052764 85.1045 H(7)C(4)N(1)O(1) 0.0 Chemical derivative 1886 0.0 +Xlink:BuUrBu[214]@S 214.095357 214.2185 H(14)C(9)N(2)O(4) 0.0 Chemical derivative 1888 0.0 +Xlink:BuUrBu[214]@Protein_N-term 214.095357 214.2185 H(14)C(9)N(2)O(4) 0.0 Chemical derivative 1888 0.0 +Xlink:BuUrBu[214]@K 214.095357 214.2185 H(14)C(9)N(2)O(4) 0.0 Chemical derivative 1888 0.0 +Xlink:BuUrBu[214]@T 214.095357 214.2185 H(14)C(9)N(2)O(4) 0.0 Chemical derivative 1888 0.0 +Xlink:BuUrBu[214]@Y 214.095357 214.2185 H(14)C(9)N(2)O(4) 0.0 Chemical derivative 1888 0.0 +Xlink:BuUrBu[317]@S 317.158686 317.3382 H(23)C(13)N(3)O(6) 0.0 Chemical derivative 1889 0.0 +Xlink:BuUrBu[317]@Protein_N-term 317.158686 317.3382 H(23)C(13)N(3)O(6) 0.0 Chemical derivative 1889 0.0 +Xlink:BuUrBu[317]@K 317.158686 317.3382 H(23)C(13)N(3)O(6) 0.0 Chemical derivative 1889 0.0 +Xlink:BuUrBu[317]@T 317.158686 317.3382 H(23)C(13)N(3)O(6) 0.0 Chemical derivative 1889 0.0 +Xlink:BuUrBu[317]@Y 317.158686 317.3382 H(23)C(13)N(3)O(6) 0.0 Chemical derivative 1889 0.0 +Xlink:DSSO[158]@K 158.003765 158.175 H(6)C(6)O(3)S(1) 0.0 Chemical derivative 1896 0.0 +Xlink:DSSO[158]@Protein_N-term 158.003765 158.175 H(6)C(6)O(3)S(1) 0.0 Chemical derivative 1896 0.0 +Xlink:DSS[138]@K 138.06808 138.1638 H(10)C(8)O(2) 0.0 Chemical derivative 1898 0.0 +Xlink:DSS[138]@Protein_N-term 138.06808 138.1638 H(10)C(8)O(2) 0.0 Chemical derivative 1898 0.0 +Xlink:BuUrBu[196]@S 196.084792 196.2032 H(12)C(9)N(2)O(3) 0.0 Chemical derivative 1899 0.0 +Xlink:BuUrBu[196]@Protein_N-term 196.084792 196.2032 H(12)C(9)N(2)O(3) 0.0 Chemical derivative 1899 0.0 +Xlink:BuUrBu[196]@K 196.084792 196.2032 H(12)C(9)N(2)O(3) 0.0 Chemical derivative 1899 0.0 +Xlink:BuUrBu[196]@T 196.084792 196.2032 H(12)C(9)N(2)O(3) 0.0 Chemical derivative 1899 0.0 +Xlink:BuUrBu[196]@Y 196.084792 196.2032 H(12)C(9)N(2)O(3) 0.0 Chemical derivative 1899 0.0 +Xlink:DTBP[172]@K 172.01289 172.2711 H(8)C(6)N(2)S(2) 0.0 Chemical derivative 1900 0.0 +Xlink:DTBP[172]@Protein_N-term 172.01289 172.2711 H(8)C(6)N(2)S(2) 0.0 Chemical derivative 1900 0.0 +Xlink:DST[114]@K 113.995309 114.0563 H(2)C(4)O(4) 0.0 Chemical derivative 1901 0.0 +Xlink:DST[114]@Protein_N-term 113.995309 114.0563 H(2)C(4)O(4) 0.0 Chemical derivative 1901 0.0 +Xlink:DTSSP[174]@K 173.980921 174.2406 H(6)C(6)O(2)S(2) 0.0 Chemical derivative 1902 0.0 +Xlink:DTSSP[174]@Protein_N-term 173.980921 174.2406 H(6)C(6)O(2)S(2) 0.0 Chemical derivative 1902 0.0 +Xlink:SMCC[219]@C 219.089543 219.2365 H(13)C(12)N(1)O(3) 0.0 Chemical derivative 1903 0.0 +Xlink:SMCC[219]@K 219.089543 219.2365 H(13)C(12)N(1)O(3) 0.0 Chemical derivative 1903 0.0 +Xlink:SMCC[219]@Protein_N-term 219.089543 219.2365 H(13)C(12)N(1)O(3) 0.0 Chemical derivative 
1903 0.0 +Cation:Al[III]@D 23.958063 23.9577 H(-3)Al(1) 0.0 Artefact 1910 0.0 +Cation:Al[III]@E 23.958063 23.9577 H(-3)Al(1) 0.0 Artefact 1910 0.0 +Cation:Al[III]@Any_C-term 23.958063 23.9577 H(-3)Al(1) 0.0 Artefact 1910 0.0 +Xlink:BS2G[113]@Protein_N-term 113.047679 113.1146 H(7)C(5)N(1)O(2) 0.0 Chemical derivative 1906 0.0 +Xlink:BS2G[113]@K 113.047679 113.1146 H(7)C(5)N(1)O(2) 0.0 Chemical derivative 1906 0.0 +Xlink:BS2G[114]@Protein_N-term 114.031694 114.0993 H(6)C(5)O(3) 0.0 Chemical derivative 1907 0.0 +Xlink:BS2G[114]@K 114.031694 114.0993 H(6)C(5)O(3) 0.0 Chemical derivative 1907 0.0 +Xlink:BS2G[217]@Protein_N-term 217.095023 217.2191 H(15)C(9)N(1)O(5) 0.0 Chemical derivative 1908 0.0 +Xlink:BS2G[217]@K 217.095023 217.2191 H(15)C(9)N(1)O(5) 0.0 Chemical derivative 1908 0.0 +Xlink:DMP[139]@K 139.110947 139.1982 H(13)C(7)N(3) 0.0 Chemical derivative 1911 0.0 +Xlink:DMP[139]@Protein_N-term 139.110947 139.1982 H(13)C(7)N(3) 0.0 Chemical derivative 1911 0.0 +Xlink:DMP[122]@K 122.084398 122.1677 H(10)C(7)N(2) 0.0 Chemical derivative 1912 0.0 +Xlink:DMP[122]@Protein_N-term 122.084398 122.1677 H(10)C(7)N(2) 0.0 Chemical derivative 1912 0.0 +glyoxalAGE@R 21.98435 22.0055 H(-2)C(2) 0.0 Post-translational 1913 0.0 +Met->AspSA@M -32.008456 -32.1081 H(-4)C(-1)O(1)S(-1) 0.0 Chemical derivative 1914 0.0 +Decarboxylation@D -30.010565 -30.026 H(-2)C(-1)O(-1) 0.0 Chemical derivative 1915 0.0 +Decarboxylation@E -30.010565 -30.026 H(-2)C(-1)O(-1) 0.0 Chemical derivative 1915 0.0 +Aspartylurea@H -10.031969 -10.0412 H(-2)C(-1)N(-2)O(2) 0.0 Chemical derivative 1916 0.0 +Formylasparagine@H 4.97893 4.9735 H(-1)C(-1)N(-1)O(2) 0.0 Chemical derivative 1917 0.0 +Carbonyl@S 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@R 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@Q 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@L 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@I 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@E 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@A 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Carbonyl@V 13.979265 13.9835 H(-2)O(1) 0.0 Chemical derivative 1918 0.0 +Pro->HAVA@P 18.010565 18.0153 H(2)O(1) 0.0 Chemical derivative 1922 0.0 +AFB1_Dialdehyde@K 310.047738 310.2577 H(10)C(17)O(6) 0.0 Post-translational 1920 0.0 +Delta:H(-4)O(2)@W 27.958529 27.967 H(-4)O(2) 0.0 Chemical derivative 1923 0.0 +Delta:H(-4)O(3)@W 43.953444 43.9664 H(-4)O(3) 0.0 Chemical derivative 1924 0.0 +Delta:O(4)@W 63.979659 63.9976 O(4) 0.0 Artefact 1925 0.0 +Delta:H(4)C(3)O(2)@K 72.021129 72.0627 H(4)C(3)O(2) 0.0 Artefact 1926 0.0 +Delta:H(4)C(5)O(1)@R 80.026215 80.0847 H(4)C(5)O(1) 0.0 Chemical derivative 1927 0.0 +Delta:H(10)C(8)O(1)@K 122.073165 122.1644 H(10)C(8)O(1) 0.0 Artefact 1928 0.0 +Delta:H(6)C(7)O(4)@R 154.026609 154.1201 H(6)C(7)O(4) 0.0 Chemical derivative 1929 0.0 +Hex(2)Sulf(1)@T 404.062462 404.3444 H(20)C(12)O(13)S(1) 404.062462 H(20)C(12)O(13)S(1) O-linked glycosylation 1932 0.5 +Hex(2)Sulf(1)@S 404.062462 404.3444 H(20)C(12)O(13)S(1) 404.062462 H(20)C(12)O(13)S(1) O-linked glycosylation 1932 0.5 +Pent(2)@T 264.084518 264.2292 H(16)C(10)O(8) 264.084518 H(16)C(10)O(8) O-linked glycosylation 1930 0.5 +Pent(2)@S 264.084518 264.2292 H(16)C(10)O(8) 264.084518 H(16)C(10)O(8) O-linked glycosylation 1930 0.5 +Pent(1)HexNAc(1)@T 335.121631 335.3071 H(21)C(13)N(1)O(9) 335.121631 H(21)C(13)N(1)O(9) O-linked glycosylation 1931 0.5 +Pent(1)HexNAc(1)@S 
335.121631 335.3071 H(21)C(13)N(1)O(9) 335.121631 H(21)C(13)N(1)O(9) O-linked glycosylation 1931 0.5 +Hex(1)Pent(2)Me(1)@T 440.152991 440.3964 H(28)C(17)O(13) 440.152991 H(28)C(17)O(13) O-linked glycosylation 1933 0.5 +Hex(1)Pent(2)Me(1)@S 440.152991 440.3964 H(28)C(17)O(13) 440.152991 H(28)C(17)O(13) O-linked glycosylation 1933 0.5 +HexNAc(2)Sulf(1)@S 486.11556 486.4482 H(26)C(16)N(2)O(13)S(1) 486.11556 H(26)C(16)N(2)O(13)S(1) O-linked glycosylation 1934 0.5 +HexNAc(2)Sulf(1)@T 486.11556 486.4482 H(26)C(16)N(2)O(13)S(1) 486.11556 H(26)C(16)N(2)O(13)S(1) O-linked glycosylation 1934 0.5 +Hex(1)Pent(3)Me(1)@S 572.19525 572.511 H(36)C(22)O(17) 572.19525 H(36)C(22)O(17) O-linked glycosylation 1935 0.5 +Hex(1)Pent(3)Me(1)@T 572.19525 572.511 H(36)C(22)O(17) 572.19525 H(36)C(22)O(17) O-linked glycosylation 1935 0.5 +Hex(2)Pent(2)@S 588.190165 588.5104 H(36)C(22)O(18) 588.190165 H(36)C(22)O(18) O-linked glycosylation 1936 0.5 +Hex(2)Pent(2)@T 588.190165 588.5104 H(36)C(22)O(18) 588.190165 H(36)C(22)O(18) O-linked glycosylation 1936 0.5 +Hex(2)Pent(2)Me(1)@S 602.205815 602.537 H(38)C(23)O(18) 602.205815 H(38)C(23)O(18) O-linked glycosylation 1937 0.5 +Hex(2)Pent(2)Me(1)@T 602.205815 602.537 H(38)C(23)O(18) 602.205815 H(38)C(23)O(18) O-linked glycosylation 1937 0.5 +Hex(4)HexA(1)@S 824.243382 824.6865 H(48)C(30)O(26) 824.243382 H(48)C(30)O(26) O-linked glycosylation 1938 0.5 +Hex(4)HexA(1)@T 824.243382 824.6865 H(48)C(30)O(26) 824.243382 H(48)C(30)O(26) O-linked glycosylation 1938 0.5 +Hex(2)HexNAc(1)Pent(1)HexA(1)@S 835.259366 835.7125 H(49)C(31)N(1)O(25) 835.259366 H(49)C(31)N(1)O(25) O-linked glycosylation 1939 0.5 +Hex(2)HexNAc(1)Pent(1)HexA(1)@T 835.259366 835.7125 H(49)C(31)N(1)O(25) 835.259366 H(49)C(31)N(1)O(25) O-linked glycosylation 1939 0.5 +Hex(3)HexNAc(1)HexA(1)@S 865.269931 865.7384 H(51)C(32)N(1)O(26) 865.269931 H(51)C(32)N(1)O(26) O-linked glycosylation 1940 0.5 +Hex(3)HexNAc(1)HexA(1)@T 865.269931 865.7384 H(51)C(32)N(1)O(26) 865.269931 H(51)C(32)N(1)O(26) O-linked glycosylation 1940 0.5 +Hex(1)HexNAc(2)dHex(2)Sulf(1)@S 940.284201 940.8712 H(56)C(34)N(2)O(26)S(1) 940.284201 H(56)C(34)N(2)O(26)S(1) O-linked glycosylation 1941 0.5 +Hex(1)HexNAc(2)dHex(2)Sulf(1)@T 940.284201 940.8712 H(56)C(34)N(2)O(26)S(1) 940.284201 H(56)C(34)N(2)O(26)S(1) O-linked glycosylation 1941 0.5 +HexA(2)HexNAc(3)@S 961.302294 961.8258 H(55)C(36)N(3)O(27) 961.302294 H(55)C(36)N(3)O(27) O-linked glycosylation 1942 0.5 +HexA(2)HexNAc(3)@T 961.302294 961.8258 H(55)C(36)N(3)O(27) 961.302294 H(55)C(36)N(3)O(27) O-linked glycosylation 1942 0.5 +dHex(1)Hex(4)HexA(1)@T 970.301291 970.8277 H(58)C(36)O(30) 970.301291 H(58)C(36)O(30) O-linked glycosylation 1943 0.5 +dHex(1)Hex(4)HexA(1)@S 970.301291 970.8277 H(58)C(36)O(30) 970.301291 H(58)C(36)O(30) O-linked glycosylation 1943 0.5 +Hex(5)HexA(1)@S 986.296206 986.8271 H(58)C(36)O(31) 986.296206 H(58)C(36)O(31) O-linked glycosylation 1944 0.5 +Hex(5)HexA(1)@T 986.296206 986.8271 H(58)C(36)O(31) 986.296206 H(58)C(36)O(31) O-linked glycosylation 1944 0.5 +Hex(4)HexA(1)HexNAc(1)@T 1027.322755 1027.879 H(61)C(38)N(1)O(31) 1027.322755 H(61)C(38)N(1)O(31) O-linked glycosylation 1945 0.5 +Hex(4)HexA(1)HexNAc(1)@S 1027.322755 1027.879 H(61)C(38)N(1)O(31) 1027.322755 H(61)C(38)N(1)O(31) O-linked glycosylation 1945 0.5 +dHex(3)Hex(3)HexNAc(1)@T 1127.41157 1128.0379 H(73)C(44)N(1)O(32) 1127.41157 H(73)C(44)N(1)O(32) O-linked glycosylation 1946 0.5 +dHex(3)Hex(3)HexNAc(1)@S 1127.41157 1128.0379 H(73)C(44)N(1)O(32) 1127.41157 H(73)C(44)N(1)O(32) O-linked glycosylation 1946 0.5 
+Hex(6)HexNAc(1)@N 1175.396314 1176.0361 H(73)C(44)N(1)O(35) 1175.396314 H(73)C(44)N(1)O(35) N-linked glycosylation 1947 0.5 +Hex(1)HexNAc(4)dHex(1)Sulf(1)@T 1200.385037 1201.1151 H(72)C(44)N(4)O(32)S(1) 1200.385037 H(72)C(44)N(4)O(32)S(1) O-linked glycosylation 1948 0.5 +Hex(1)HexNAc(4)dHex(1)Sulf(1)@S 1200.385037 1201.1151 H(72)C(44)N(4)O(32)S(1) 1200.385037 H(72)C(44)N(4)O(32)S(1) O-linked glycosylation 1948 0.5 +dHex(1)Hex(2)HexNAc(1)NeuAc(2)@T 1255.433762 1256.1241 H(77)C(48)N(3)O(35) 1255.433762 H(77)C(48)N(3)O(35) O-linked glycosylation 1949 0.5 +dHex(1)Hex(2)HexNAc(1)NeuAc(2)@S 1255.433762 1256.1241 H(77)C(48)N(3)O(35) 1255.433762 H(77)C(48)N(3)O(35) O-linked glycosylation 1949 0.5 +dHex(3)Hex(3)HexNAc(2)@T 1330.490942 1331.2304 H(86)C(52)N(2)O(37) 1330.490942 H(86)C(52)N(2)O(37) O-linked glycosylation 1950 0.5 +dHex(3)Hex(3)HexNAc(2)@S 1330.490942 1331.2304 H(86)C(52)N(2)O(37) 1330.490942 H(86)C(52)N(2)O(37) O-linked glycosylation 1950 0.5 +dHex(2)Hex(1)HexNAc(4)Sulf(1)@T 1346.442946 1347.2563 H(82)C(50)N(4)O(36)S(1) 1346.442946 H(82)C(50)N(4)O(36)S(1) O-linked glycosylation 1951 0.5 +dHex(2)Hex(1)HexNAc(4)Sulf(1)@S 1346.442946 1347.2563 H(82)C(50)N(4)O(36)S(1) 1346.442946 H(82)C(50)N(4)O(36)S(1) O-linked glycosylation 1951 0.5 +dHex(1)Hex(2)HexNAc(4)Sulf(2)@T 1442.394675 1443.3189 H(82)C(50)N(4)O(40)S(2) 1442.394675 H(82)C(50)N(4)O(40)S(2) O-linked glycosylation 1952 0.5 +dHex(1)Hex(2)HexNAc(4)Sulf(2)@S 1442.394675 1443.3189 H(82)C(50)N(4)O(40)S(2) 1442.394675 H(82)C(50)N(4)O(40)S(2) O-linked glycosylation 1952 0.5 +Hex(9)@N 1458.475412 1459.2654 H(90)C(54)O(45) 1458.475412 H(90)C(54)O(45) N-linked glycosylation 1953 0.5 +dHex(2)Hex(3)HexNAc(3)Sulf(1)@T 1467.469221 1468.345 H(89)C(54)N(3)O(41)S(1) 1467.469221 H(89)C(54)N(3)O(41)S(1) O-linked glycosylation 1954 0.5 +dHex(2)Hex(3)HexNAc(3)Sulf(1)@S 1467.469221 1468.345 H(89)C(54)N(3)O(41)S(1) 1467.469221 H(89)C(54)N(3)O(41)S(1) O-linked glycosylation 1954 0.5 +dHex(2)Hex(5)HexNAc(2)Me(1)@T 1522.554331 1523.397 H(98)C(59)N(2)O(43) 1522.554331 H(98)C(59)N(2)O(43) O-linked glycosylation 1955 0.5 +dHex(2)Hex(5)HexNAc(2)Me(1)@S 1522.554331 1523.397 H(98)C(59)N(2)O(43) 1522.554331 H(98)C(59)N(2)O(43) O-linked glycosylation 1955 0.5 +dHex(2)Hex(2)HexNAc(4)Sulf(2)@T 1588.452584 1589.4601 H(92)C(56)N(4)O(44)S(2) 1588.452584 H(92)C(56)N(4)O(44)S(2) O-linked glycosylation 1956 0.5 +dHex(2)Hex(2)HexNAc(4)Sulf(2)@S 1588.452584 1589.4601 H(92)C(56)N(4)O(44)S(2) 1588.452584 H(92)C(56)N(4)O(44)S(2) O-linked glycosylation 1956 0.5 +Hex(9)HexNAc(1)@N 1661.554784 1662.4579 H(103)C(62)N(1)O(50) 1661.554784 H(103)C(62)N(1)O(50) N-linked glycosylation 1957 0.5 +dHex(3)Hex(2)HexNAc(4)Sulf(2)@S 1734.510493 1735.6013 H(102)C(62)N(4)O(48)S(2) 1734.510493 H(102)C(62)N(4)O(48)S(2) O-linked glycosylation 1958 0.5 +dHex(3)Hex(2)HexNAc(4)Sulf(2)@T 1734.510493 1735.6013 H(102)C(62)N(4)O(48)S(2) 1734.510493 H(102)C(62)N(4)O(48)S(2) O-linked glycosylation 1958 0.5 +Hex(4)HexNAc(4)NeuGc(1)@N 1767.619116 1768.5865 H(109)C(67)N(5)O(49) 1767.619116 H(109)C(67)N(5)O(49) N-linked glycosylation 1959 0.5 +Hex(4)HexNAc(4)NeuGc(1)@S 1767.619116 1768.5865 H(109)C(67)N(5)O(49) 1767.619116 H(109)C(67)N(5)O(49) O-linked glycosylation 1959 0.5 +Hex(4)HexNAc(4)NeuGc(1)@T 1767.619116 1768.5865 H(109)C(67)N(5)O(49) 1767.619116 H(109)C(67)N(5)O(49) O-linked glycosylation 1959 0.5 +dHex(4)Hex(3)HexNAc(2)NeuAc(1)@T 1767.644268 1768.6262 H(113)C(69)N(3)O(49) 1767.644268 H(113)C(69)N(3)O(49) O-linked glycosylation 1960 0.5 +dHex(4)Hex(3)HexNAc(2)NeuAc(1)@S 1767.644268 1768.6262 
H(113)C(69)N(3)O(49) 1767.644268 H(113)C(69)N(3)O(49) O-linked glycosylation 1960 0.5 +Hex(3)HexNAc(5)NeuAc(1)@N 1792.65075 1793.639 H(112)C(69)N(6)O(48) 1792.65075 H(112)C(69)N(6)O(48) N-linked glycosylation 1961 0.5 +Hex(10)HexNAc(1)@N 1823.607608 1824.5985 H(113)C(68)N(1)O(55) 1823.607608 H(113)C(68)N(1)O(55) N-linked glycosylation 1962 0.5 +dHex(1)Hex(8)HexNAc(2)@N 1848.639242 1849.651 H(116)C(70)N(2)O(54) 1848.639242 H(116)C(70)N(2)O(54) N-linked glycosylation 1963 0.5 +Hex(3)HexNAc(4)NeuAc(2)@N 1880.666794 1881.701 H(116)C(72)N(6)O(51) 1880.666794 H(116)C(72)N(6)O(51) N-linked glycosylation 1964 0.5 +dHex(2)Hex(3)HexNAc(4)NeuAc(1)@N 1881.687195 1882.7289 H(119)C(73)N(5)O(51) 1881.687195 H(119)C(73)N(5)O(51) N-linked glycosylation 1965 0.5 +dHex(2)Hex(2)HexNAc(6)Sulf(1)@S 1914.654515 1915.7819 H(118)C(72)N(6)O(51)S(1) 1914.654515 H(118)C(72)N(6)O(51)S(1) O-linked glycosylation 1966 0.5 +dHex(2)Hex(2)HexNAc(6)Sulf(1)@T 1914.654515 1915.7819 H(118)C(72)N(6)O(51)S(1) 1914.654515 H(118)C(72)N(6)O(51)S(1) O-linked glycosylation 1966 0.5 +Hex(5)HexNAc(4)NeuAc(1)Ac(1)@N 1955.687589 1956.7643 H(121)C(75)N(5)O(54) 1955.687589 H(121)C(75)N(5)O(54) N-linked glycosylation 1967 0.5 +Hex(3)HexNAc(3)NeuAc(3)@S 1968.682838 1969.7631 H(120)C(75)N(6)O(54) 1968.682838 H(120)C(75)N(6)O(54) O-linked glycosylation 1968 0.5 +Hex(3)HexNAc(3)NeuAc(3)@T 1968.682838 1969.7631 H(120)C(75)N(6)O(54) 1968.682838 H(120)C(75)N(6)O(54) O-linked glycosylation 1968 0.5 +Hex(5)HexNAc(4)NeuAc(1)Ac(2)@N 1997.698154 1998.801 H(123)C(77)N(5)O(55) 1997.698154 H(123)C(77)N(5)O(55) N-linked glycosylation 1969 0.5 +Unknown:162@Any_C-term 162.125595 162.2267 H(18)C(8)O(3) 0.0 Artefact 1970 0.0 +Unknown:162@E 162.125595 162.2267 H(18)C(8)O(3) 0.0 Artefact 1970 0.0 +Unknown:162@D 162.125595 162.2267 H(18)C(8)O(3) 0.0 Artefact 1970 0.0 +Unknown:162@Any_N-term 162.125595 162.2267 H(18)C(8)O(3) 0.0 Artefact 1970 0.0 +Unknown:177@D 176.744957 176.4788 H(-7)O(1)Fe(3) 0.0 Artefact 1971 0.0 +Unknown:177@E 176.744957 176.4788 H(-7)O(1)Fe(3) 0.0 Artefact 1971 0.0 +Unknown:177@Any_C-term 176.744957 176.4788 H(-7)O(1)Fe(3) 0.0 Artefact 1971 0.0 +Unknown:177@Any_N-term 176.744957 176.4788 H(-7)O(1)Fe(3) 0.0 Artefact 1971 0.0 +Unknown:210@D 210.16198 210.3126 H(22)C(13)O(2) 0.0 Artefact 1972 0.0 +Unknown:210@E 210.16198 210.3126 H(22)C(13)O(2) 0.0 Artefact 1972 0.0 +Unknown:210@Any_C-term 210.16198 210.3126 H(22)C(13)O(2) 0.0 Artefact 1972 0.0 +Unknown:210@Any_N-term 210.16198 210.3126 H(22)C(13)O(2) 0.0 Artefact 1972 0.0 +Unknown:216@D 216.099774 216.231 H(16)C(10)O(5) 0.0 Artefact 1973 0.0 +Unknown:216@E 216.099774 216.231 H(16)C(10)O(5) 0.0 Artefact 1973 0.0 +Unknown:216@Any_C-term 216.099774 216.231 H(16)C(10)O(5) 0.0 Artefact 1973 0.0 +Unknown:216@Any_N-term 216.099774 216.231 H(16)C(10)O(5) 0.0 Artefact 1973 0.0 +Unknown:234@D 234.073953 234.2033 H(14)C(9)O(7) 0.0 Artefact 1974 0.0 +Unknown:234@E 234.073953 234.2033 H(14)C(9)O(7) 0.0 Artefact 1974 0.0 +Unknown:234@Any_C-term 234.073953 234.2033 H(14)C(9)O(7) 0.0 Artefact 1974 0.0 +Unknown:234@Any_N-term 234.073953 234.2033 H(14)C(9)O(7) 0.0 Artefact 1974 0.0 +Unknown:248@D 248.19876 248.359 H(28)C(13)O(4) 0.0 Artefact 1975 0.0 +Unknown:248@E 248.19876 248.359 H(28)C(13)O(4) 0.0 Artefact 1975 0.0 +Unknown:248@Any_C-term 248.19876 248.359 H(28)C(13)O(4) 0.0 Artefact 1975 0.0 +Unknown:248@Any_N-term 248.19876 248.359 H(28)C(13)O(4) 0.0 Artefact 1975 0.0 +Unknown:250@D 249.981018 250.2075 H(4)C(10)N(1)O(5)S(1) 0.0 Artefact 1976 0.0 +Unknown:250@E 249.981018 250.2075 H(4)C(10)N(1)O(5)S(1) 0.0 
Artefact 1976 0.0 +Unknown:250@Any_C-term 249.981018 250.2075 H(4)C(10)N(1)O(5)S(1) 0.0 Artefact 1976 0.0 +Unknown:250@Any_N-term 249.981018 250.2075 H(4)C(10)N(1)O(5)S(1) 0.0 Artefact 1976 0.0 +Unknown:302@D 301.986514 302.2656 H(8)C(4)N(5)O(7)S(2) 0.0 Artefact 1977 0.0 +Unknown:302@E 301.986514 302.2656 H(8)C(4)N(5)O(7)S(2) 0.0 Artefact 1977 0.0 +Unknown:302@Any_C-term 301.986514 302.2656 H(8)C(4)N(5)O(7)S(2) 0.0 Artefact 1977 0.0 +Unknown:302@Any_N-term 301.986514 302.2656 H(8)C(4)N(5)O(7)S(2) 0.0 Artefact 1977 0.0 +Unknown:306@D 306.095082 306.2659 H(18)C(12)O(9) 0.0 Artefact 1978 0.0 +Unknown:306@E 306.095082 306.2659 H(18)C(12)O(9) 0.0 Artefact 1978 0.0 +Unknown:306@Any_C-term 306.095082 306.2659 H(18)C(12)O(9) 0.0 Artefact 1978 0.0 +Unknown:306@Any_N-term 306.095082 306.2659 H(18)C(12)O(9) 0.0 Artefact 1978 0.0 +Unknown:420@Any_N-term 420.051719 420.5888 H(24)C(12)N(2)O(6)S(4) 420.051719 H(24)C(12)N(2)O(6)S(4) Artefact 1979 0.5 +Unknown:420@Any_C-term 420.051719 420.5888 H(24)C(12)N(2)O(6)S(4) 420.051719 H(24)C(12)N(2)O(6)S(4) Artefact 1979 0.5 +Diethylphosphothione@Y 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +Diethylphosphothione@T 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +Diethylphosphothione@S 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +Diethylphosphothione@K 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +Diethylphosphothione@H 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +Diethylphosphothione@C 152.006087 152.1518 H(9)C(4)O(2)P(1)S(1) 0.0 Chemical derivative 1986 0.0 +CIGG@K 330.136176 330.4032 H(22)C(13)N(4)O(4)S(1) 0.0 Post-translational 1990 0.0 +GNLLFLACYCIGG@K 1324.6308 1325.598 H(92)C(61)N(14)O(15)S(2) 0.0 Post-translational 1991 0.0 +Dimethylphosphothione@S 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +Dimethylphosphothione@K 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +Dimethylphosphothione@H 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +Dimethylphosphothione@C 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +Dimethylphosphothione@Y 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +Dimethylphosphothione@T 123.974787 124.0987 H(5)C(2)O(2)P(1)S(1) 0.0 Chemical derivative 1987 0.0 +monomethylphosphothione@S 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +monomethylphosphothione@K 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +monomethylphosphothione@H 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +monomethylphosphothione@C 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +monomethylphosphothione@T 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +monomethylphosphothione@Y 109.959137 110.0721 H(3)C(1)O(2)P(1)S(1) 0.0 Chemical derivative 1989 0.0 +TMPP-Ac:13C(9)@Y 581.211328 581.474 H(33)C(20)13C(9)O(10)P(1) 0.0 Artefact 1993 0.0 +TMPP-Ac:13C(9)@K 581.211328 581.474 H(33)C(20)13C(9)O(10)P(1) 0.0 Artefact 1993 0.0 +TMPP-Ac:13C(9)@Any_N-term 581.211328 581.474 H(33)C(20)13C(9)O(10)P(1) 0.0 Chemical derivative 1993 0.0 +Lys+O(2)@H 160.084792 160.1711 H(12)C(6)N(2)O(3) 0.0 Post-translational 2036 0.0 +ZQG@K 320.100836 320.2973 H(16)C(15)N(2)O(6) 134.036779 H(6)C(8)O(2) Chemical derivative 2001 0.5 +Xlink:DST[56]@Protein_N-term 55.989829 56.0202 C(2)O(2) 0.0 
Chemical derivative 1999 0.0 +Xlink:DST[56]@K 55.989829 56.0202 C(2)O(2) 0.0 Chemical derivative 1999 0.0 +Haloxon@Y 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Haloxon@T 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Haloxon@S 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Haloxon@K 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Haloxon@H 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Haloxon@C 203.950987 204.9763 H(7)C(4)O(3)P(1)Cl(2) 0.0 Chemical derivative 2006 0.0 +Methamidophos-O@Y 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Methamidophos-O@T 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Methamidophos-O@S 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Methamidophos-O@K 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Methamidophos-O@H 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Methamidophos-O@C 92.997965 93.0217 H(4)C(1)N(1)O(2)P(1) 0.0 Chemical derivative 2008 0.0 +Nitrene@Y 12.995249 12.9988 H(-1)N(1) 0.0 Artefact 2014 0.0 +shTMT@Any_N-term 235.176741 235.2201 H(20)C(3)13C(9)15N(2)O(2) 0.0 Chemical derivative 2015 0.0 +shTMT@Protein_N-term 235.176741 235.2201 H(20)C(3)13C(9)15N(2)O(2) 0.0 Chemical derivative 2015 0.0 +shTMT@K 235.176741 235.2201 H(20)C(3)13C(9)15N(2)O(2) 0.0 Chemical derivative 2015 0.0 +TMTpro@T 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro@S 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro@H 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro@Protein_N-term 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro@Any_N-term 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro@K 304.207146 304.3127 H(25)C(8)13C(7)N(1)15N(2)O(3) 0.0 Isotopic label 2016 0.0 +TMTpro_zero@S 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +TMTpro_zero@H 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +TMTpro_zero@Protein_N-term 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +TMTpro_zero@Any_N-term 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +TMTpro_zero@K 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +TMTpro_zero@T 295.189592 295.3773 H(25)C(15)N(3)O(3) 0.0 Chemical derivative 2017 0.0 +3-hydroxybenzyl-phosphate@S 186.008196 186.1018 H(7)C(7)O(4)P(1) 0.0 Chemical derivative 2041 0.0 +3-hydroxybenzyl-phosphate@K 186.008196 186.1018 H(7)C(7)O(4)P(1) 0.0 Chemical derivative 2041 0.0 +3-hydroxybenzyl-phosphate@T 186.008196 186.1018 H(7)C(7)O(4)P(1) 0.0 Chemical derivative 2041 0.0 +3-hydroxybenzyl-phosphate@Y 186.008196 186.1018 H(7)C(7)O(4)P(1) 0.0 Chemical derivative 2041 0.0 +Hex(6)HexNAc(5)NeuAc(3)@N 2861.000054 2862.5699 H(176)C(109)N(8)O(79) 2861.000054 H(176)C(109)N(8)O(79) N-linked glycosylation 2028 0.5 +Andro-H2O@C 332.19876 332.4339 H(28)C(20)O(4) 0.0 Chemical derivative 2025 0.0 +His+O(2)@H 169.048741 169.1381 H(7)C(6)N(3)O(3) 0.0 Post-translational 2027 0.0 +Hex(7)HexNAc(6)@S 2352.846 2354.1393 H(148)C(90)N(6)O(65) 2352.846 H(148)C(90)N(6)O(65) O-linked glycosylation 2029 0.5 +Hex(7)HexNAc(6)@T 2352.846 2354.1393 H(148)C(90)N(6)O(65) 2352.846 H(148)C(90)N(6)O(65) O-linked glycosylation 2029 0.5 
+Hex(7)HexNAc(6)@N 2352.846 2354.1393 H(148)C(90)N(6)O(65) 2352.846 H(148)C(90)N(6)O(65) N-linked glycosylation 2029 0.5 +Met+O(2)@H 163.030314 163.1949 H(9)C(5)N(1)O(3)S(1) 0.0 Chemical derivative 2033 0.0 +Gly+O(2)@H 89.011293 89.0501 H(3)C(2)N(1)O(3) 0.0 Chemical derivative 2034 0.0 +Glu+O(2)@H 161.032422 161.1128 H(7)C(5)N(1)O(5) 0.0 Post-translational 2037 0.0 +MBS+peptide@C 1482.77 1483.7597 H(108)C(81)N(7)O(19) 0.0 Chemical derivative 2040 0.0 +phenyl-phosphate@S 155.997631 156.0759 H(5)C(6)O(3)P(1) 0.0 Chemical derivative 2042 0.0 +phenyl-phosphate@K 155.997631 156.0759 H(5)C(6)O(3)P(1) 0.0 Chemical derivative 2042 0.0 +phenyl-phosphate@T 155.997631 156.0759 H(5)C(6)O(3)P(1) 0.0 Chemical derivative 2042 0.0 +phenyl-phosphate@Y 155.997631 156.0759 H(5)C(6)O(3)P(1) 0.0 Chemical derivative 2042 0.0 +RBS-ID_Uridine@Y 244.069536 244.2014 H(12)C(9)N(2)O(6) 0.0 Other 2044 0.0 +pRBS-ID_4-thiouridine@F 226.058972 226.1861 H(10)C(9)N(2)O(5) 132.042259 H(8)C(5)O(4) Other 2054 0.5 +Biotin:Aha-PC@M 690.24316 690.7246 H(38)C(29)N(8)O(10)S(1) 0.0 Chemical derivative 2053 0.0 +DBIA@C 296.184841 296.3654 H(24)C(14)N(4)O(3) 0.0 Chemical derivative 2062 0.0 +pRBS-ID_6-thioguanosine@W 265.081104 265.2254 H(11)C(10)N(5)O(4) 132.042259 H(8)C(5)O(4) Other 2055 0.5 +6C-CysPAT@Y 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@T 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@S 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@E 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@D 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@H 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@Any_N-term 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@K 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Artefact 2057 0.0 +6C-CysPAT@C 221.081695 221.1907 H(16)C(8)N(1)O(4)P(1) 0.0 Chemical derivative 2057 0.0 +Xlink:DSPP[210]@Protein_N-term 209.97181 210.0802 H(3)C(8)O(5)P(1) 0.0 Chemical derivative 2058 0.0 +Xlink:DSPP[210]@K 209.97181 210.0802 H(3)C(8)O(5)P(1) 0.0 Chemical derivative 2058 0.0 +Xlink:DSPP[228]@Protein_N-term 227.982375 228.0955 H(5)C(8)O(6)P(1) 0.0 Chemical derivative 2059 0.0 +Xlink:DSPP[228]@K 227.982375 228.0955 H(5)C(8)O(6)P(1) 0.0 Chemical derivative 2059 0.0 +Xlink:DSPP[331]@Protein_N-term 331.045704 331.2152 H(14)C(12)N(1)O(8)P(1) 0.0 Chemical derivative 2060 0.0 +Xlink:DSPP[331]@K 331.045704 331.2152 H(14)C(12)N(1)O(8)P(1) 0.0 Chemical derivative 2060 0.0 +Xlink:DSPP[226]@K 225.990534 226.1028 H(5)C(8)N(1)O(5)P(1) 0.0 Chemical derivative 2061 0.0 +Xlink:DSPP[226]@Protein_N-term 225.990534 226.1028 H(5)C(8)N(1)O(5)P(1) 0.0 Chemical derivative 2061 0.0 +N6pAMP@Y 367.06817 367.2539 H(14)C(13)N(5)O(6)P(1) 0.0 Chemical derivative 2073 0.0 +N6pAMP@T 367.06817 367.2539 H(14)C(13)N(5)O(6)P(1) 0.0 Chemical derivative 2073 0.0 +N6pAMP@S 367.06817 367.2539 H(14)C(13)N(5)O(6)P(1) 0.0 Chemical derivative 2073 0.0 +DABCYL-C2-maleimide@K 391.16444 391.4231 H(21)C(21)N(5)O(3) 251.105862 H(13)C(15)N(3)O(1) Chemical derivative 2074 0.5 +DABCYL-C2-maleimide@C 391.16444 391.4231 H(21)C(21)N(5)O(3) 251.105862 H(13)C(15)N(3)O(1) Chemical derivative 2074 0.5 +Ethynyl@C 24.0 24.0214 C(2) 0.0 Chemical derivative 2081 0.0 +Mono_Nγ-propargyl-L-Gln_desthiobiotin@C 596.328211 596.6764 H(44)C(26)N(8)O(8) 0.0 Chemical derivative 2067 0.0 +Di_L-Glu_Nγ-propargyl-L-Gln_desthiobiotin@E 709.375889 709.7909 H(51)C(31)N(9)O(10) 469.301268 H(39)C(21)N(7)O(5) 
Chemical derivative 2068 0.5 +Di_L-Glu_Nγ-propargyl-L-Gln_desthiobiotin@D 709.375889 709.7909 H(51)C(31)N(9)O(10) 469.301268 H(39)C(21)N(7)O(5) Chemical derivative 2068 0.5 +Di_L-Gln_Nγ-propargyl-L-Gln_desthiobiotin@E 708.391873 708.8062 H(52)C(31)N(10)O(9) 726.402438 H(54)C(31)N(10)O(10) Chemical derivative 2069 0.5 +Di_L-Gln_Nγ-propargyl-L-Gln_desthiobiotin@D 708.391873 708.8062 H(52)C(31)N(10)O(9) 726.402438 H(54)C(31)N(10)O(10) Chemical derivative 2069 0.5 +L-Gln@D 128.058578 128.1292 H(8)C(5)N(2)O(2) 0.0 Post-translational 2070 0.0 +L-Gln@E 128.058578 128.1292 H(8)C(5)N(2)O(2) 0.0 Post-translational 2070 0.0 +Glyceroyl@Protein_N-term 88.016044 88.0621 H(4)C(3)O(3) 0.0 Post-translational 2072 0.0 +Glyceroyl@K 88.016044 88.0621 H(4)C(3)O(3) 0.0 Post-translational 2072 0.0 +NBF@R 163.001791 163.0904 H(1)C(6)N(3)O(3) 0.0 Chemical derivative 2079 0.0 +NBF@K 163.001791 163.0904 H(1)C(6)N(3)O(3) 0.0 Chemical derivative 2079 0.0 +NBF@C 163.001791 163.0904 H(1)C(6)N(3)O(3) 0.0 Chemical derivative 2079 0.0 +DCP@C 168.078644 168.1898 H(12)C(9)O(3) 0.0 Chemical derivative 2080 0.0 +QQTGG@K 471.207761 471.465 H(29)C(18)N(7)O(8) 0.0 Other 2082 0.0 +Pyro-QQTGG@K 454.181212 454.4344 H(26)C(18)N(6)O(8) 0.0 Other 2083 0.0 +NQTGG@K 457.192111 457.4384 H(27)C(17)N(7)O(8) 0.0 Other 2084 0.0 +DVFQQQTGG@K 960.43011 960.9865 H(60)C(41)N(12)O(15) 0.0 Other 2085 0.0 +iST-NHS_specific_cysteine_modification@C 113.084064 113.1576 H(11)C(6)N(1)O(1) 0.0 Chemical derivative 2086 0.0 +Label:13C(2)15N(1)@G 3.003745 2.9787 C(-2)13C(2)N(-1)15N(1) 0.0 Isotopic label 2088 0.0 +GlyGly@K 114.042927 114.1026 H(6)C(4)N(2)O(2) 0.0 Multiple 121 NCC(=O)NCC(=O)NCCCC[C@H](N([Xe])([Xe]))C([Rn])=O 1000000.0 +Pro->(2S,4R)-4-fluoroproline@P 0.0 0.0 F(1)H(-1) 0.0 User-added 0 F[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0 +Pro->(2S,4S)-4fluoroproline@P 0.0 0.0 F(1)H(-1) 0.0 User-added 0 F[C@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0 +Pro->(2S)-1,3-thiazolidine-2-carboxylic_acid@P 0.0 0.0 C(-1)H(-2)S(1) 0.0 User-added 0 S1[C@H](N([Xe])CC1)C(=O)[Rn] 0.0 +Pro->(4R)-1,3-Thiazolidine-4-carboxylic_acid@P 0.0 0.0 C(-1)H(-2)S(1) 0.0 User-added 0 S1CN([Xe])[C@@H](C1)C(=O)[Rn] 0.0 +Pro->(2S,4R)-4-hydroxyproline@P 0.0 0.0 O(1) 0.0 User-added 0 O[C@@H]1C[C@H](N([Xe])C1)C(=O)[Rn] 0.0 +Pro->(DL)-pipecolic_acid@P 0.0 0.0 C(1)H(2) 0.0 User-added 0 C1CCN([Xe])C(C1)C(=O)[Rn] 0.0 +Pro->3,4-Dehydro-L-proline@P 0.0 0.0 H(-2) 0.0 User-added 0 C1C=CC(N1([Xe]))C(=O)[Rn] 0.0 +Pro->(1S,3S,5S)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P 0.0 0.0 C(1) 0.0 User-added 0 [C@H]12N([Xe])[C@@H](C[C@@H]2C1)C(=O)[Rn] 0.0 +Pro->(1R,3S,5R)-2-Azabicyclo[3.1.0]hexane-3-carboxylic_acid@P 0.0 0.0 C(1) 0.0 User-added 0 [C@@H]12N([Xe])[C@@H](C[C@H]2C1)C(=O)[Rn] 0.0 +Pro->(2S,3aS,7aS)-Octahydro-1H-indole-2-carboxylic_acid@P 0.0 0.0 C(4)H(6) 0.0 User-added 0 N1([Xe])[C@@H](C[C@@H]2CCCC[C@H]12)C(=O)[Rn] 0.0 +Pro->(DL)-5-trifluoromethylproline@P 0.0 0.0 C(1)F(3)H(-1) 0.0 User-added 0 FC(C1CCC(N1([Xe]))C(=O)[Rn])(F)F 0.0 +mTRAQ@Protein_N-term 0.0 0.0 C(7)H(12)N(2)O(1) 0.0 User-added 0 C(=O)CN1CCN(CC1)C 0.0 +mTRAQ:13C(3)15N(1)@Protein_N-term 0.0 0.0 13C(3)15N(1)C(4)H(12)N(1)O(1) 0.0 User-added 0 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])N(CC1)C 0.0 +mTRAQ:13C(6)15N(2)@Protein_N-term 0.0 0.0 13C(6)15N(2)C(1)H(12)O(1) 0.0 User-added 0 C(=O)[13C]([H])([H])[15N]1[13C]([H])([H])[13C]([H])([H])[15N]([13C]([H])([H])[13C]1([H])([H]))[13C]([H])([H])([H]) 0.0 +Biotin@Protein_N-term 0.0 0.0 C(10)H(14)N(2)O(2)S(1) 0.0 User-added 0 C(=O)CCCCC1SCC2NC(=O)NC21 0.0 +Carbamidomethyl@Protein_N-term 0.0 
0.0 C(2)H(3)N(1)O(1) 0.0 User-added 0 C(=O)NC 0.0
+Propionamide@Protein_N-term 0.0 0.0 C(3)H(5)N(1)O(1) 0.0 User-added 0 CCC(N)=O 0.0
+Pyridylacetyl@Protein_N-term 0.0 0.0 C(7)H(5)N(1)O(1) 0.0 User-added 0 C(=O)Cc1ccccn1 0.0
+Methyl@Protein_C-term 0.0 0.0 C(1)H(2) 0.0 User-added 0 OC 0.0
+Ethyl@Protein_C-term 0.0 0.0 C(2)H(4) 0.0 User-added 0 OCC 0.0
+Cation:Na@Protein_C-term 0.0 0.0 H(-1)Na(1) 0.0 User-added 0 O[Na] 0.0
+Cation:K@Protein_C-term 0.0 0.0 H(-1)K(1) 0.0 User-added 0 O[K] 0.0
+Cation:Cu[I]@Protein_C-term 0.0 0.0 Cu(1)H(-1) 0.0 User-added 0 O[Cu] 0.0
+Cation:Li@Protein_C-term 0.0 0.0 H(-1)Li(1) 0.0 User-added 0 O[Li] 0.0

From ab4352cae12c4fca677fd01df85aad5bf01740ed Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 22:04:38 -0400
Subject: [PATCH 57/75] refactor: Update normalization field in Redeem CLI
 properties

---
 .../src/properties/inference/inference.rs | 117 +++++++++++++++++-
 .../src/properties/inference/input.rs     |   5 +
 crates/redeem-cli/src/properties/train/input.rs | 6 +-
 .../src/properties/train/trainer.rs       |   5 +-
 .../src/models/model_interface.rs         |  49 +++++---
 5 files changed, 156 insertions(+), 26 deletions(-)

diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs
index 6be1dea..b171f77 100644
--- a/crates/redeem-cli/src/properties/inference/inference.rs
+++ b/crates/redeem-cli/src/properties/inference/inference.rs
@@ -1,16 +1,20 @@
 use anyhow::{Context, Result};
+use maud::{PreEscaped, html};
 use redeem_properties::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
 use redeem_properties::models::ccs_cnn_tf_model::CCSCNNTFModel;
-use redeem_properties::models::ccs_model::load_collision_cross_section_model;
 use redeem_properties::models::model_interface::ModelInterface;
 use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel;
-use redeem_properties::models::rt_model::load_retention_time_model;
 use redeem_properties::utils::data_handling::{PeptideData, TargetNormalization};
-use redeem_properties::utils::peptdeep_utils::load_modifications;
+use redeem_properties::utils::peptdeep_utils::{load_modifications, MODIFICATION_MAP};
 use redeem_properties::utils::utils::get_device;
+use report_builder::{
+    Report, ReportSection,
+    plots::plot_scatter,
+};

 use crate::properties::inference::input::PropertyInferenceConfig;
 use crate::properties::inference::output::write_peptide_data;
+use crate::properties::train::sample_peptides;
 use crate::properties::load_data::load_peptide_data;
 use crate::properties::util::write_bytes_to_file;

@@ -23,7 +27,7 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> {
         &config.model_arch,
         Some(config.nce),
         Some(config.instrument.clone()),
-        Some("min_max".to_string()),
+        Some(config.normalization.clone().unwrap()),
         &modifications,
     )?;
     log::info!("Loaded {} peptides", inference_data.len());
@@ -90,6 +94,111 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> {
     log::info!("Predictions saved to: {}", config.output_file);
     write_peptide_data(&inference_results, &config.output_file)?;

+    // Generate report
+    let mut report = Report::new(
+        "ReDeeM",
+        &config.version,
+        Some("https://github.com/singjc/redeem/blob/master/img/redeem_logo.png?raw=true"),
+        &format!("ReDeeM {:?} Inference Report", config.model_arch),
+    );
+
+    /* Section 1: Overview */
+    {
+        let mut overview_section = ReportSection::new("Overview");
+
+        overview_section.add_content(html! {
+            "This report summarizes the inference process of the ReDeeM model."
+        });
+
+        let modifications = MODIFICATION_MAP.clone();
+
+        let normalize_field = if config.model_arch.contains("ccs") {
+            "ccs"
+        } else {
+            "retention time"
+        };
+
+        // Inference scatter plot
+        let inference_data_sampled: Vec<PeptideData> = sample_peptides(&inference_data, 5000);
+
+        let (true_rt, pred_rt): (Vec<f64>, Vec<f64>) = inference_data_sampled
+            .iter()
+            .zip(&inference_results)
+            .filter_map(|(true_pep, pred_pep)| {
+                match normalize_field {
+                    "ccs" => {
+                        match (true_pep.ccs, pred_pep.ccs) {
+                            (Some(t), Some(p)) => {
+                                let t_denorm = match norm_factor {
+                                    TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64,
+                                    TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64,
+                                    TargetNormalization::None => t as f64,
+                                };
+                                Some((t_denorm, p as f64))
+                            }
+                            _ => None,
+                        }
+                    },
+                    _ => {
+                        match (true_pep.retention_time, pred_pep.retention_time) {
+                            (Some(t), Some(p)) => {
+                                let t_denorm = match norm_factor {
+                                    TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64,
+                                    TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64,
+                                    TargetNormalization::None => t as f64,
+                                };
+                                Some((t_denorm, p as f64))
+                            }
+                            _ => None,
+                        }
+                    }
+                }
+            })
+            .unzip();
+
+
+        let scatter_plot = plot_scatter(
+            &vec![true_rt.clone()],
+            &vec![pred_rt.clone()],
+            vec!["Prediction".to_string()],
+            "Predicted vs True (Random 5000 Inference Peptides)",
+            "Target",
+            "Predicted",
+        )
+        .unwrap();
+        overview_section.add_plot(scatter_plot);
+
+        report.add_section(overview_section);
+    }
+
+
+    /* Section 2: Configuration */
+    {
+        let mut config_section = ReportSection::new("Configuration");
+        config_section.add_content(html! {
+            style {
+                ".code-container {
+                    background-color: #f5f5f5;
+                    padding: 10px;
+                    border-radius: 5px;
+                    overflow-x: auto;
+                    font-family: monospace;
+                    white-space: pre-wrap;
+                }"
+            }
+            div class="code-container" {
+                pre {
+                    code { (PreEscaped(serde_json::to_string_pretty(&config)?)) }
+                }
+            }
+        });
+        report.add_section(config_section);
+    }
+
+    // Save the report to HTML file
+    let path = "redeem_inference_report.html";
+    report.save_to_file(&path.to_string())?;
+
     let path = "redeem_inference_config.json";
     let json = serde_json::to_string_pretty(&config)?;
     println!("{}", json);
diff --git a/crates/redeem-cli/src/properties/inference/input.rs b/crates/redeem-cli/src/properties/inference/input.rs
index fcdacd5..e8ee563 100644
--- a/crates/redeem-cli/src/properties/inference/input.rs
+++ b/crates/redeem-cli/src/properties/inference/input.rs
@@ -8,9 +8,11 @@ use crate::properties::util::validate_tsv_or_csv_file;

 #[derive(Debug, Deserialize, Serialize, Clone)]
 pub struct PropertyInferenceConfig {
+    pub version: String,
     pub model_path: String,
     pub inference_data: String,
     pub output_file: String,
+    pub normalization: Option<String>,
     pub model_arch: String,
     pub device: String,
     pub batch_size: usize,
@@ -21,9 +23,11 @@ pub struct PropertyInferenceConfig {
 impl Default for PropertyInferenceConfig {
     fn default() -> Self {
         PropertyInferenceConfig {
+            version: clap::crate_version!().to_string(),
             model_path: String::new(),
             inference_data: String::new(),
             output_file: String::from("redeem_inference.csv"),
+            normalization: Some(String::from("min_max")),
             model_arch: String::from("rt_cnn_tf"),
             device: String::from("cpu"),
             batch_size: 64,
@@ -64,6 +68,7 @@ impl PropertyInferenceConfig {
         load_or_default!(model_path);
         load_or_default!(inference_data);
         load_or_default!(output_file);
+        load_or_default!(normalization);
         load_or_default!(model_arch);
         load_or_default!(device);
         load_or_default!(batch_size);
diff --git a/crates/redeem-cli/src/properties/train/input.rs b/crates/redeem-cli/src/properties/train/input.rs
index c8523b9..45356bc 100644
--- a/crates/redeem-cli/src/properties/train/input.rs
+++ b/crates/redeem-cli/src/properties/train/input.rs
@@ -13,7 +13,7 @@ pub struct PropertyTrainConfig {
     pub train_data: String,
     pub validation_data: Option<String>,
     pub output_file: String,
-    pub rt_normalization: Option<String>,
+    pub normalization: Option<String>,
     pub model_arch: String,
     pub device: String,
     pub batch_size: usize,
@@ -33,7 +33,7 @@ impl Default for PropertyTrainConfig {
         train_data: String::new(),
         validation_data: None,
         output_file: String::from("rt_cnn_tf.safetensors"),
-        rt_normalization: Some(String::from("min_max")),
+        normalization: Some(String::from("min_max")),
         model_arch: String::from("rt_cnn_tf"),
         device: String::from("cpu"),
         batch_size: 64,
@@ -79,7 +79,7 @@ impl PropertyTrainConfig {
         load_or_default!(train_data);
         load_or_default!(validation_data);
         load_or_default!(output_file);
-        load_or_default!(rt_normalization);
+        load_or_default!(normalization);
         load_or_default!(model_arch);
         load_or_default!(device);
         load_or_default!(batch_size);
diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index 701bf8e..66dcf2b 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -1,7 +1,6 @@
 use anyhow::{Context, Result};
 use maud::{PreEscaped, html};
 use redeem_properties::models::model_interface::ModelInterface;
-use redeem_properties::models::rt_model::load_retention_time_model;
 use redeem_properties::models::{
     ccs_cnn_lstm_model::CCSCNNLSTMModel, ccs_cnn_tf_model::CCSCNNTFModel,
     rt_cnn_lstm_model::RTCNNLSTMModel, rt_cnn_transformer_model::RTCNNTFModel,
@@ -33,7 +32,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         &config.model_arch,
         Some(config.nce),
         Some(config.instrument.clone()),
-        Some(config.rt_normalization.clone().unwrap()),
+        Some(config.normalization.clone().unwrap()),
         &modifications,
     )?;
     log::info!("Loaded {} training peptides", train_peptides.len());
@@ -45,7 +44,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         &config.model_arch,
         Some(config.nce),
         Some(config.instrument.clone()),
-        Some(config.rt_normalization.clone().unwrap()),
+        Some(config.normalization.clone().unwrap()),
         &modifications,
     )
     .context("Failed to load validation data")?;
diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index eec347a..ff83f6b 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -667,6 +667,16 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                     best_val_loss = avg_val_loss;
                     epochs_without_improvement = 0;

+                    // Check if the prior checkpoint exists, if it does delete it
+                    let checkpoint_path = format!(
+                        "redeem_{}_best_val_ckpt_model_epoch_{}.safetensors",
+                        self.get_model_arch(),
+                        epoch - 1
+                    );
+                    if PathBuf::from(&checkpoint_path).exists() {
+                        std::fs::remove_file(&checkpoint_path)?;
+                    }
+
                     let checkpoint_path = format!(
                         "redeem_{}_best_val_ckpt_model_epoch_{}.safetensors",
                         self.get_model_arch(),
                         epoch
                     );
                     self.get_mut_varmap().save(&checkpoint_path)?;
@@ -910,7 +920,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
             (String, Option<String>),
             crate::utils::peptdeep_utils::ModificationMap,
         >,
-        rt_norm: TargetNormalization,
+        target_norm: TargetNormalization,
    ) -> Result<Vec<PeptideData>> {
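        // Inference runs over `batch_size`-sized chunks in parallel (rayon's
        // `par_chunks` below); each prediction is tagged with `start_idx + i`
        // so the batched results can be written back into a pre-allocated
        // `Vec<Option<PeptideData>>` and returned in the original input order.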
        let num_batches = (inference_data.len() + batch_size - 1) / batch_size;
        info!(
            "Running inference on {} peptides in {} batches",
            inference_data.len(),
            num_batches
        );
-
+
        let progress = Progress::new(inference_data.len(), "[inference] Batch:");
        let mut result: Vec<Option<PeptideData>> = vec![None; inference_data.len()];
-
+
        inference_data
            .par_chunks(batch_size)
            .enumerate()
            .map(|(batch_idx, batch_data)| {
                let start_idx = batch_idx * batch_size;
                let batch: PeptideBatchData = batch_data.into();
-
+
                let naked_sequences = &batch.naked_sequence;
                let mods = &batch.mods;
                let mod_sites = &batch.mod_sites;
-
+
                let charges = if batch.charges.iter().all(|c| c.is_some()) {
                    Some(batch.charges.iter().map(|c| c.unwrap()).collect::<Vec<_>>())
                } else {
                    None
                };
-
+
                let nces = if batch.nces.iter().all(|n| n.is_some()) {
                    Some(batch.nces.iter().map(|n| n.unwrap()).collect::<Vec<_>>())
                } else {
                    None
                };
-
+
                let instruments = if batch.instruments.iter().all(|i| i.is_some()) {
                    Some(batch.instruments.clone())
                } else {
                    None
                };
-
+
                let input_tensor = self
                    .encode_peptides(naked_sequences, mods, mod_sites, charges, nces, instruments)?
                    .to_device(self.get_device())?;
                let output = self.forward(&input_tensor)?;
-
+
                match self.property_type() {
                    PropertyType::RT | PropertyType::CCS => {
                        let predictions = output.to_vec1()?;
                        let batch_results: Vec<(usize, PeptideData)> = predictions
                            .iter()
                            .enumerate()
                            .map(|(i, &pred)| {
                                let mut peptide = batch_data[i].clone();
                                match self.property_type() {
                                    PropertyType::RT => {
                                        peptide.retention_time = Some(match target_norm {
                                            TargetNormalization::ZScore(mean, std) => pred * std + mean,
                                            TargetNormalization::MinMax(min, max) => {
                                                pred * (max - min) + min
                                            }
                                            TargetNormalization::None => pred,
                                        });
                                    }
+                                    PropertyType::CCS => {
+                                        peptide.ccs = Some(match target_norm {
+                                            TargetNormalization::ZScore(mean, std) => pred * std + mean,
+                                            TargetNormalization::MinMax(min, max) => {
+                                                pred * (max - min) + min
+                                            }
+                                            TargetNormalization::None => pred,
+                                        });
+                                    }
-                                    PropertyType::CCS => peptide.ion_mobility = Some(pred),
                                    _ => {}
-                                };
+                                }
                                (start_idx + i, peptide)
                            })
                            .collect();
                        result[idx] = Some(peptide);
                        progress.inc();
                    });
-
+
        progress.finish();
        Ok(result.into_iter().flatten().collect())
    }
+
+    /// Extract encoded input and target tensor for a batch of peptides.
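+    ///
+    /// Targets are taken in normalized space; `predict` applies the inverse
+    /// transform before writing values back onto each peptide. A minimal sketch
+    /// of that inverse, assuming `MinMax` carries `(min, max)` as it does in
+    /// `predict` above (illustrative only):
+    ///
+    /// ```
+    /// use redeem_properties::utils::data_handling::TargetNormalization;
+    ///
+    /// let norm = TargetNormalization::MinMax(10.0_f32, 110.0_f32);
+    /// let pred = 0.25_f32;
+    /// let denorm = match norm {
+    ///     TargetNormalization::ZScore(mean, std) => pred * std + mean,
+    ///     TargetNormalization::MinMax(min, max) => pred * (max - min) + min,
+    ///     TargetNormalization::None => pred,
+    /// };
+    /// assert!((denorm - 35.0).abs() < 1e-6);
+    /// ```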
    fn prepare_batch_inputs(
        &self,
        batch_data: &[PeptideData],
        modifications: &HashMap<
            (String, Option<String>),
            crate::utils::peptdeep_utils::ModificationMap,
        >,
    ) -> Result<(Tensor, Tensor)> {
-        use rayon::prelude::*;
-
        let batch: PeptideBatchData = batch_data.into();

        let naked_sequences = &batch.naked_sequence;
@@ -1045,7 +1062,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
                .map(|v| v.unwrap_or(0.0))
                .collect(),
            PropertyType::CCS => batch
-                .ion_mobilities
+                .ccs
                .iter()
                .map(|v| v.unwrap_or(0.0))
                .collect(),

From 16bba8fb7b96762628b9c075693ce06790fb485e Mon Sep 17 00:00:00 2001
From: singjc
Date: Wed, 14 May 2025 22:04:44 -0400
Subject: [PATCH 58/75] refactor: Update loading of modifications to use byte
 slice instead of file path

---
 crates/redeem-properties/src/utils/peptdeep_utils.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/redeem-properties/src/utils/peptdeep_utils.rs b/crates/redeem-properties/src/utils/peptdeep_utils.rs
index d802348..d30b6ae 100644
--- a/crates/redeem-properties/src/utils/peptdeep_utils.rs
+++ b/crates/redeem-properties/src/utils/peptdeep_utils.rs
@@ -234,7 +234,7 @@ pub fn load_mod_to_feature_arc(
 ///
 /// # Example
 /// ```
-/// use easypqp_core::data_handling::remove_mass_shift;
+/// use redeem_properties::utils::peptdeep_utils::remove_mass_shift;
 ///
 /// let peptide = "MGC[+57.0215]AAR";
 /// assert_eq!(remove_mass_shift(peptide), "MGCAAR");

From 99ae99fcf85e009a590212ef9b92a1d9c8f095bd Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 15 May 2025 01:42:40 -0400
Subject: [PATCH 59/75] refactor: Update training configuration in
 redeem-properties crate

---
 .../src/properties/train/trainer.rs     |   3 +
 .../redeem-properties/src/models/ccs_model.rs |   3 +
 .../src/models/model_interface.rs       | 384 ++++++------------
 .../redeem-properties/src/models/rt_model.rs  |   3 +
 4 files changed, 139 insertions(+), 254 deletions(-)

diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs
index 66dcf2b..cd06a79 100644
--- a/crates/redeem-cli/src/properties/train/trainer.rs
+++ b/crates/redeem-cli/src/properties/train/trainer.rs
@@ -143,6 +143,9 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> {
         config.learning_rate as f64,
         config.epochs,
         config.early_stopping_patience,
+        "training",
+        true,
+        true
     ).with_context(|| "Training failed: an error occurred during the model training process")?;
     log::info!("Training completed in {:?}", start_time.elapsed());
     model.save(&config.output_file)?;
diff --git a/crates/redeem-properties/src/models/ccs_model.rs b/crates/redeem-properties/src/models/ccs_model.rs
index a61b2e3..9e9d129 100644
--- a/crates/redeem-properties/src/models/ccs_model.rs
+++ b/crates/redeem-properties/src/models/ccs_model.rs
@@ -95,6 +95,9 @@ impl CCSModelWrapper {
             learning_rate,
             epochs,
             early_stopping_patience,
+            "training",
+            true,
+            true,
         )
     }
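The three new trailing arguments above map to the `context`, `save_checkpoints`, and `track_metrics` parameters added to `ModelInterface::train` in the next hunk. As a hedged sketch of a call site after this change — the leading arguments are not fully shown in this excerpt, so their order here follows the doc comment below and should be treated as illustrative:

    // Illustrative only — mirrors the trainer.rs/ccs_model.rs hunks above.
    let metrics = model.train(
        &train_peptides,
        Some(&val_peptides),
        batch_size,
        validation_batch_size,
        &modifications,
        learning_rate,
        epochs,
        early_stopping_patience,
        "training", // context: label used in log lines and progress bars
        true,       // save_checkpoints: write per-epoch .safetensors checkpoints
        true,       // track_metrics: record per-step losses and learning rates
    )?;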
diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index ff83f6b..d4b7194 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -465,6 +465,8 @@ pub trait ModelInterface: Send + Sync + ModelClone {
    /// This method initializes model weights from scratch and trains over the given peptide feature data for a specified
    /// number of epochs. Optionally performs validation and tracks both training and validation loss statistics.
    /// Early stopping is applied if the validation loss does not improve for a consecutive number of epochs.
+    ///
+    /// A Cosine Annealing with Warmup learning rate scheduler is used to adjust the learning rate during training. The initial warmup period is set to 10% of the total training steps.
    ///
    /// # Arguments
    /// * `training_data` - Vector of peptide records used for training.
@@ -475,14 +477,12 @@ pub trait ModelInterface: Send + Sync + ModelClone {
    /// * `learning_rate` - Learning rate for the AdamW optimizer.
    /// * `epochs` - Maximum number of training epochs.
    /// * `early_stopping_patience` - Number of epochs to wait before stopping if validation loss does not improve.
+    /// * `context` - A string representing the context for logging, e.g., "training" or "fine-tuning".
+    /// * `save_checkpoints` - Flag to save model checkpoints during training.
+    /// * `track_metrics` - Flag to track training and validation metrics.
    ///
    /// # Returns
-    /// A `Vec` of tuples where each tuple contains:
-    /// * `epoch` - Epoch number.
-    /// * `avg_train_loss` - Average training loss for the epoch.
-    /// * `avg_val_loss` - Optional average validation loss for the epoch.
-    /// * `train_std` - Standard deviation of training loss across batches.
-    /// * `val_std` - Optional standard deviation of validation loss across batches.
+    /// [`TrainingStepMetrics`] - A struct containing training and validation loss statistics, learning rates, and other metrics.
    fn train(
        &mut self,
        training_data: &Vec<PeptideData>,
        validation_data: Option<&Vec<PeptideData>>,
        batch_size: usize,
        validation_batch_size: usize,
        modifications: &HashMap<
            (String, Option<String>),
            crate::utils::peptdeep_utils::ModificationMap,
        >,
        learning_rate: f64,
        epochs: usize,
        early_stopping_patience: usize,
+        context: &str,
+        save_checkpoints: bool,
+        track_metrics: bool,
    ) -> Result<TrainingStepMetrics> {
        let num_batches = (training_data.len() + batch_size - 1) / batch_size;
        let total_steps = num_batches * epochs;
        let warmup_steps = total_steps / 10;

        info!(
-            "Training {} model from on {} peptide features ({} batches) for {} epochs",
+            "{} {} model on {} peptide features ({} batches) for {} epochs",
+            context,
            self.get_model_arch(),
            training_data.len(),
            num_batches,
            epochs
        );

        let mut epoch_losses = vec![];

        for epoch in 0..epochs {
-            let progress = Progress::new(num_batches, &format!("[training] Epoch {}: ", epoch));
+            let progress = Progress::new(num_batches, &format!("[{}] Epoch {}: ", context, epoch));
            let mut batch_losses = vec![];

            training_data.chunks(batch_size).enumerate().try_for_each(
                |(idx, batch)| -> Result<()> {
                        _ => None,
                    };

+                    if track_metrics {
+                        step_metrics.epochs.push(epoch);
+                        step_metrics.steps.push(step_idx);
+                        step_metrics
+                            .learning_rates
+                            .push(lr_scheduler.get_last_lr() as f64);
+                        step_metrics.losses.push(loss_val);
+                        step_metrics.phases.push(TrainingPhase::Train);
+                        step_metrics.accuracies.push(acc);
+                        step_metrics.precisions.push(None);
+                        step_metrics.recalls.push(None);
+                        step_idx += 1;
+                    }
+
                    progress.update_description(&format!(
-                        "[training] Epoch {}: Loss: {:.4}",
-                        epoch, loss_val
+                        "[{}] Epoch {}: Loss: {:.4}",
+                        context, epoch, loss_val
                    ));
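                    // For reference, the schedule driving `lr_scheduler` above
                    // follows the warmup-then-cosine shape described in the doc
                    // comment (a sketch; the concrete scheduler type is not
                    // shown in this hunk):
                    //   step < warmup_steps: lr = base_lr * step / warmup_steps
                    //   otherwise:           lr = base_lr * 0.5 * (1 + cos(pi * t)),
                    //   where t = (step - warmup_steps) / (total_steps - warmup_steps)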
progress.inc(); @@ -633,17 +640,19 @@ pub trait ModelInterface: Send + Sync + ModelClone { }) .collect::>()?; - for (val_loss, idx, lr, acc) in &val_results { - step_metrics.epochs.push(epoch); - step_metrics.steps.push(val_step_idx + idx); - step_metrics.learning_rates.push(*lr); - step_metrics.losses.push(*val_loss); - step_metrics.phases.push(TrainingPhase::Validation); - step_metrics.accuracies.push(*acc); - step_metrics.precisions.push(None); - step_metrics.recalls.push(None); + if track_metrics{ + for (val_loss, idx, lr, acc) in &val_results { + step_metrics.epochs.push(epoch); + step_metrics.steps.push(val_step_idx + idx); + step_metrics.learning_rates.push(*lr); + step_metrics.losses.push(*val_loss); + step_metrics.phases.push(TrainingPhase::Validation); + step_metrics.accuracies.push(*acc); + step_metrics.precisions.push(None); + step_metrics.recalls.push(None); + } + val_step_idx += val_results.len(); } - val_step_idx += val_results.len(); let val_losses: Vec = val_results.iter().map(|(loss, _, _, _)| *loss).collect(); @@ -666,45 +675,18 @@ pub trait ModelInterface: Send + Sync + ModelClone { if avg_val_loss < best_val_loss { best_val_loss = avg_val_loss; epochs_without_improvement = 0; - - // Check if the prior checkpoint exists, if it does delete it - let checkpoint_path = format!( - "redeem_{}_best_val_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - 1 - ); - if PathBuf::from(&checkpoint_path).exists() { - std::fs::remove_file(&checkpoint_path)?; + if save_checkpoints{ + self.save_epoch_checkpoint(epoch, "val")?; } - - let checkpoint_path = format!( - "redeem_{}_best_val_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - ); - self.get_mut_varmap().save(&checkpoint_path)?; } else { epochs_without_improvement += 1; if epochs_without_improvement >= early_stopping_patience { info!("Early stopping triggered after {} epochs without validation loss improvement.", early_stopping_patience); return Ok(step_metrics); } - let checkpoint_path = format!( - "redeem_{}_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - 1 - ); - // Check if the prior checkpoint exists, if it does delete it - if PathBuf::from(&checkpoint_path).exists() { - std::fs::remove_file(&checkpoint_path)?; + if save_checkpoints{ + self.save_epoch_checkpoint(epoch, "train")?; } - // Save the current checkpoint - let checkpoint_path = format!( - "redeem_{}_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - ); - self.get_mut_varmap().save(&checkpoint_path)?; } } else { epoch_losses.push((epoch, avg_loss, None, std_loss, None)); @@ -713,37 +695,17 @@ pub trait ModelInterface: Send + Sync + ModelClone { epoch, avg_loss, std_loss )); progress.finish(); - - let checkpoint_path = format!( - "redeem_{}_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - 1 - ); - // Check if the prior checkpoint exists, if it does delete it - if PathBuf::from(&checkpoint_path).exists() { - std::fs::remove_file(&checkpoint_path)?; + if save_checkpoints{ + self.save_epoch_checkpoint(epoch, "train")?; } - // Save the current checkpoint - let checkpoint_path = format!( - "redeem_{}_ckpt_model_epoch_{}.safetensors", - self.get_model_arch(), - epoch - ); - self.get_mut_varmap().save(&checkpoint_path)?; } } Ok(step_metrics) } - /// Fine-tune the model on a batch of training data. - /// - /// # Arguments - /// * `training_data` - A vector of `PeptideData` instances representing the training data. 
- /// * `modifications` - A map of modifications and their corresponding feature vectors. - /// * `batch_size` - The batch size to use for training. - /// * `learning_rate` - The learning rate to use for training. - /// * `epochs` - The number of epochs to train for. + /// Fine-tune the model on new data using the main [`ModelInterface::train`] method. + /// This is a wrapper that disables validation and early stopping. fn fine_tune( &mut self, training_data: &Vec, @@ -755,160 +717,21 @@ pub trait ModelInterface: Send + Sync + ModelClone { learning_rate: f64, epochs: usize, ) -> Result<()> { - // let num_batches = if training_data.len() < batch_size { - // 1 - // } else { - // let full_batches = training_data.len() / batch_size; - // if training_data.len() % batch_size > 0 { - // full_batches + 1 - // } else { - // full_batches - // } - // }; - - // info!( - // "Fine-tuning {} model on {} peptide features ({} batches) for {} epochs", - // self.get_model_arch(), - // training_data.len(), - // num_batches, - // epochs - // ); - - // let params = candle_nn::ParamsAdamW { - // lr: learning_rate, - // ..Default::default() - // }; - // let mut opt = candle_nn::AdamW::new(self.get_mut_varmap().all_vars(), params)?; - - // for epoch in 0..epochs { - // let progress = Progress::new(num_batches, &format!("[fine-tuning] Epoch {}: ", epoch)); - // let mut total_loss = 0.0; - - // for batch_idx in 0..num_batches { - // let start = batch_idx * batch_size; - // let end = (start + batch_size).min(training_data.len()); - // let batch_data = &training_data[start..end]; - - // let peptides: Vec = batch_data - // .iter() - // .map(|p| remove_mass_shift(&p.sequence)) - // .collect(); - // let mods: Vec = batch_data - // .iter() - // .map(|p| get_modification_string(&p.sequence, &modifications)) - // .collect(); - // let mod_sites: Vec = batch_data - // .iter() - // .map(|p| get_modification_indices(&p.sequence)) - // .collect(); - - // let charges = batch_data - // .iter() - // .filter_map(|p| p.charge) - // .collect::>(); - // let charges = if charges.len() == batch_data.len() { - // Some(charges) - // } else { - // None - // }; - - // let nces = batch_data.iter().filter_map(|p| p.nce).collect::>(); - // let nces = if nces.len() == batch_data.len() { - // Some(nces) - // } else { - // None - // }; - - // let instruments = batch_data - // .iter() - // .filter_map(|p| p.instrument.clone()) - // .collect::>(); - // let instruments = if instruments.len() == batch_data.len() { - // Some(instruments) - // } else { - // None - // }; - - // let input_batch = self - // .encode_peptides(&peptides, &mods, &mod_sites, charges, nces, instruments)? 
- // .to_device(self.get_device())?; - - // log::trace!( - // "[ModelInterface::fine_tune] input_batch shape: {:?}, device: {:?}", - // input_batch.shape(), - // input_batch.device() - // ); - - // let batch_targets = match self.property_type() { - // PropertyType::RT => PredictionResult::RTResult( - // batch_data - // .iter() - // .map(|p| p.retention_time.unwrap_or_default()) - // .collect(), - // ), - // PropertyType::CCS => PredictionResult::CCSResult( - // batch_data - // .iter() - // .map(|p| p.ion_mobility.unwrap_or_default()) - // .collect(), - // ), - // PropertyType::MS2 => PredictionResult::MS2Result( - // batch_data - // .iter() - // .map(|p| p.ms2_intensities.clone().unwrap_or_default()) - // .collect(), - // ), - // }; - - // let target_batch = match batch_targets { - // PredictionResult::RTResult(ref values) - // | PredictionResult::CCSResult(ref values) => { - // Tensor::new(values.clone(), &self.get_device())? - // } - // PredictionResult::MS2Result(ref spectra) => { - // let max_len = spectra.iter().map(|s| s.len()).max().unwrap_or(1); - // let feature_dim = spectra - // .get(0) - // .and_then(|s| s.get(0)) - // .map(|v| v.len()) - // .unwrap_or(1); - // let mut padded_spectra = spectra.clone(); - // for s in &mut padded_spectra { - // s.resize(max_len, vec![0.0; feature_dim]); - // } - // Tensor::new(padded_spectra.concat(), &self.get_device())?.reshape(( - // batch_data.len(), - // max_len, - // feature_dim, - // ))? - // } - // } - // .to_device(self.get_device())?; - - // let predicted = self.forward(&input_batch)?; - // let loss = candle_nn::loss::mse(&predicted, &target_batch)?; - // opt.backward_step(&loss)?; - - // total_loss += loss.to_vec0::().unwrap_or(990.0); - - // progress.update_description(&format!( - // "[fine-tuning] Epoch {}: Loss: {}", - // epoch, - // loss.to_vec0::()? - // )); - // progress.inc(); - // } - - // let avg_loss = total_loss / num_batches as f32; - // progress.update_description(&format!( - // "[fine-tuning] Epoch {}: Avg. Batch Loss: {}", - // epoch, avg_loss - // )); - // progress.finish(); - // } - - // Ok(()) - todo!() + let _metrics = self.train( + training_data, + None, // No validation data + modifications, + batch_size, + batch_size, // Validation batch size is same but unused + learning_rate, + epochs, + usize::MAX, // Disable early stopping + "fine-tuning", + false, // No checkpoints + false, // No metrics + )?; + + Ok(()) } /// Perform inference over a batch of peptides. @@ -1055,23 +878,47 @@ pub trait ModelInterface: Send + Sync + ModelClone { .encode_peptides(naked_sequences, mods, mod_sites, charges, nces, instruments)? .to_device(self.get_device())?; - let target_values: Vec = match self.property_type() { - PropertyType::RT => batch - .retention_times - .iter() - .map(|v| v.unwrap_or(0.0)) - .collect(), - PropertyType::CCS => batch - .ccs - .iter() - .map(|v| v.unwrap_or(0.0)) - .collect(), + let target_tensor = match self.property_type() { + PropertyType::RT => { + let target_values: Vec = batch + .retention_times + .iter() + .map(|v| v.unwrap_or(0.0)) + .collect(); + Tensor::new(target_values, &self.get_device())? + } + PropertyType::CCS => { + let target_values: Vec = batch + .ccs + .iter() + .map(|v| v.unwrap_or(0.0)) + .collect(); + Tensor::new(target_values, &self.get_device())? 
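+                // RT and CCS targets are one scalar per peptide, so each arm above
+                // produces a 1-D tensor of shape (batch,). The MS2 arm below instead
+                // flattens each peptide's intensity matrix and reshapes it to
+                // (batch, seq_len, 8); the hard-coded 8 is presumably the number of
+                // fragment-ion intensity channels per position (inferred from the
+                // reshape, not stated in this patch).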
+ } PropertyType::MS2 => { - return Err(anyhow::anyhow!("MS2 training is not yet implemented")) + let mut targets = Vec::new(); + for (i, opt_peptide) in batch.ms2_intensities.iter().enumerate() { + let peptide = opt_peptide.as_ref().ok_or_else(|| { + anyhow::anyhow!("Missing MS2 intensities for peptide at index {i}") + })?; + for row in peptide { + for val in row { + targets.push(*val); + } + } + } + let shape = ( + batch.ms2_intensities.len(), + batch.ms2_intensities[0] + .as_ref() + .ok_or_else(|| anyhow::anyhow!("Missing MS2 intensities in batch"))? + .len(), + 8, + ); + Tensor::from_vec(targets, shape, &self.get_device())? } }; - let target_tensor = Tensor::new(target_values, &self.get_device())?; Ok((input_batch, target_tensor)) } @@ -1111,6 +958,35 @@ pub trait ModelInterface: Send + Sync + ModelClone { Ok(()) } + /// Save epoch checkpoint and delete prior checkpoint + fn save_epoch_checkpoint(&mut self, epoch: usize, ctx: &str) -> Result<()> { + let insert_ctx = match ctx { + "train" => "_", + "val" => "_best_val_", + _ => panic!("Invalid context for saving checkpoint. Must be 'train' or 'val'."), + }; + + // Check if the prior checkpoint exists, if it does delete it + let checkpoint_path = format!( + "redeem_{}{}ckpt_model_epoch_{}.safetensors", + self.get_model_arch(), + insert_ctx, + epoch - 1 + ); + if PathBuf::from(&checkpoint_path).exists() { + std::fs::remove_file(&checkpoint_path)?; + } + // Save the current checkpoint + let checkpoint_path = format!( + "redeem_{}{}ckpt_model_epoch_{}.safetensors", + self.get_model_arch(), + insert_ctx, + epoch + ); + self.get_mut_varmap().save(&checkpoint_path)?; + Ok(()) + } + fn apply_min_pred_value(&self, tensor: &Tensor, min_pred_value: f32) -> Result { // Create a tensor with the same shape as the input, filled with min_pred_value let min_tensor = Tensor::full(min_pred_value, tensor.shape(), tensor.device())?; diff --git a/crates/redeem-properties/src/models/rt_model.rs b/crates/redeem-properties/src/models/rt_model.rs index 7b2a166..8143b92 100644 --- a/crates/redeem-properties/src/models/rt_model.rs +++ b/crates/redeem-properties/src/models/rt_model.rs @@ -98,6 +98,9 @@ impl RTModelWrapper { learning_rate, epochs, early_stopping_patience, + "training", + true, + true, ) } From 7ae4aa87ed5b19bdf333b4b2bffd345595893ab6 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 01:42:52 -0400 Subject: [PATCH 60/75] refactor: Update AAEmbedding constructor signature to accept VarBuilder instead of Device --- .../redeem-properties/src/building_blocks/building_blocks.rs | 5 +---- crates/redeem-properties/src/models/ms2_bert_model.rs | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index 2e758fd..d99c914 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -97,10 +97,7 @@ struct AAEmbedding { } impl AAEmbedding { - fn new(hidden_size: usize, device: &Device) -> Result { - // Create a VarBuilder - let vb = nn::VarBuilder::zeros(DType::F32, device); - + fn new(hidden_size: usize, vb: &nn::VarBuilder) -> Result { // Create the embedding layer let embeddings = nn::embedding(AA_EMBEDDING_SIZE, hidden_size, vb.pp("embedding"))?; diff --git a/crates/redeem-properties/src/models/ms2_bert_model.rs b/crates/redeem-properties/src/models/ms2_bert_model.rs index 9bef37c..85a9ef6 100644 --- 
a/crates/redeem-properties/src/models/ms2_bert_model.rs +++ b/crates/redeem-properties/src/models/ms2_bert_model.rs @@ -228,7 +228,7 @@ impl ModelInterface for MS2BertModel { // Forward pass through input_nn with dropout let in_x = self .dropout - .forward(&self.input_nn.forward(&aa_indices_out, &mod_x_out)?, true)?; + .forward(&self.input_nn.forward(&aa_indices_out, &mod_x_out)?, self.is_training)?; log::trace!( "[MS2BertModel::forward] in_x shape (post dropout-input_nn): {:?}, device: {:?}", From c1d18d8c9f4d5e0d2267ec3f1eb4f04b7b556ade Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 08:39:48 -0400 Subject: [PATCH 61/75] refactor: Update semi-supervised learning to return updated ranks along with predictions --- .../examples/gbdt_semi_supervised_learning.rs | 2 +- crates/redeem-classifiers/src/data_handling.rs | 18 ++++++++++++++++++ crates/redeem-classifiers/src/psm_scorer.rs | 12 +++++++----- 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs index e0debf9..c7253c7 100644 --- a/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs +++ b/crates/redeem-classifiers/examples/gbdt_semi_supervised_learning.rs @@ -122,7 +122,7 @@ fn main() -> Result<()> { 3, Some((0.15, 1.0)) ); - let predictions = learner.fit(x, y.clone(), metadata); + let (predictions, _ranks) = learner.fit(x, y.clone(), metadata)?; println!("Labels: {:?}", y); diff --git a/crates/redeem-classifiers/src/data_handling.rs b/crates/redeem-classifiers/src/data_handling.rs index b28e9a5..354b6c8 100644 --- a/crates/redeem-classifiers/src/data_handling.rs +++ b/crates/redeem-classifiers/src/data_handling.rs @@ -143,6 +143,24 @@ impl Experiment { ); } + /// Extracts the "rank" feature column as a 1D array. 
+    ///
+    /// # Returns
+    /// * `Ok(Array1<f32>)` containing the rank values (one per row in `x`)
+    /// * `Err` if "rank" is not found in the feature names
+    pub fn get_rank_column(&self) -> anyhow::Result<Array1<f32>> {
+        let Some(rank_idx) = self
+            .psm_metadata
+            .feature_names
+            .iter()
+            .position(|name| name == "rank")
+        else {
+            anyhow::bail!("'rank' feature not found in feature_names");
+        };
+
+        Ok(self.x.column(rank_idx).to_owned())
+    }
 
     pub fn get_top_test_peaks(&self) -> Experiment {
         let mask = &self.is_train.mapv(|x| !x) & &self.is_top_peak;
 
diff --git a/crates/redeem-classifiers/src/psm_scorer.rs b/crates/redeem-classifiers/src/psm_scorer.rs
index f4bd9ba..04c7626 100644
--- a/crates/redeem-classifiers/src/psm_scorer.rs
+++ b/crates/redeem-classifiers/src/psm_scorer.rs
@@ -321,7 +321,7 @@ impl SemiSupervisedLearner {
     /// # Returns
     ///
     /// The predictions for the input features
-    pub fn fit(&mut self, x: Array2, y: Array1, psm_metadata: PsmMetadata) -> Array1<f32> {
+    pub fn fit(&mut self, x: Array2, y: Array1, psm_metadata: PsmMetadata) -> anyhow::Result<(Array1<f32>, Array1<f32>)> {
 
         let mut experiment = Experiment::new(x.clone(), y.clone(), psm_metadata.clone());
 
@@ -331,9 +331,7 @@ impl SemiSupervisedLearner {
         let (_best_feat, _best_positives, mut new_labels, best_desc, _best_feature_scores) =
             self.init_best_feature(&experiment, self.train_fdr);
 
-        // println!("Original labels: {:?}", experiment.y);
         experiment.y = new_labels.clone();
-        // println!("New labels: {:?}", experiment.y);
 
         let folds = self.create_folds(&experiment, self.xeval_num_iter, self.class_pct.map(|(t, _d)| t), self.class_pct.map(|(_t, d)| d));
 
@@ -380,11 +378,15 @@ impl SemiSupervisedLearner {
 
         // Final prediction on the entire dataset
         log::info!("Final prediction on the entire dataset");
-        let experiment = Experiment::new(x, y, psm_metadata);
+        let mut experiment = Experiment::new(x, y, psm_metadata);
 
         // self.model
         //     .fit(&experiment.x, &experiment.y.to_vec(), None, None);
 
-        Array1::from(self.model.predict_proba(&experiment.x))
+        let final_predictions = Array1::from(self.model.predict_proba(&experiment.x));
+        experiment.update_rank_feature(&final_predictions, &experiment.psm_metadata.clone());
+        let updated_ranks = experiment.get_rank_column()?;
+
+        Ok((final_predictions, updated_ranks))
     }
 }
 
From 60886782074a67b23c3ffe730b24d0ecb9148cf3 Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 15 May 2025 08:40:00 -0400
Subject: [PATCH 62/75] refactor: Update Redeem CLI to use RTCNNTFModel for
 inference

---
 crates/redeem-cli/src/properties/inference/inference.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/redeem-cli/src/properties/inference/inference.rs b/crates/redeem-cli/src/properties/inference/inference.rs
index b171f77..7445b72 100644
--- a/crates/redeem-cli/src/properties/inference/inference.rs
+++ b/crates/redeem-cli/src/properties/inference/inference.rs
@@ -4,6 +4,7 @@ use redeem_properties::models::ccs_cnn_lstm_model::CCSCNNLSTMModel;
 use redeem_properties::models::ccs_cnn_tf_model::CCSCNNTFModel;
 use redeem_properties::models::model_interface::ModelInterface;
 use redeem_properties::models::rt_cnn_lstm_model::RTCNNLSTMModel;
+use redeem_properties::models::rt_cnn_transformer_model::RTCNNTFModel;
 use redeem_properties::utils::data_handling::{PeptideData, TargetNormalization};
 use redeem_properties::utils::peptdeep_utils::{load_modifications, MODIFICATION_MAP};
 use redeem_properties::utils::utils::get_device;
@@ -46,7 +47,7 @@ pub fn run_inference(config: &PropertyInferenceConfig) -> Result<()> {
                 true,
                device.clone(),
            )?),
-        "rt_cnn_tf" => Box::new(RTCNNLSTMModel::new(
+        "rt_cnn_tf" => Box::new(RTCNNTFModel::new(
             &config.model_path,
             None,
             0,

From 2fbe12ce1bf368d757243284c8aff56376201416 Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 15 May 2025 10:03:34 -0400
Subject: [PATCH 63/75] refactor: Update data handling to extract "rank"
 feature column as 1D array of `u32`s

---
 .../redeem-classifiers/src/data_handling.rs | 21 +++++++++++++++----
 crates/redeem-classifiers/src/psm_scorer.rs |  2 +-
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/crates/redeem-classifiers/src/data_handling.rs b/crates/redeem-classifiers/src/data_handling.rs
index 354b6c8..f94de6b 100644
--- a/crates/redeem-classifiers/src/data_handling.rs
+++ b/crates/redeem-classifiers/src/data_handling.rs
@@ -143,12 +143,12 @@ impl Experiment {
         );
     }
 
-    /// Extracts the "rank" feature column as a 1D array.
+    /// Extracts the "rank" feature column as a 1D array of `u32`s.
     ///
     /// # Returns
-    /// * `Ok(Array1<f32>)` containing the rank values (one per row in `x`)
+    /// * `Ok(Array1<u32>)` containing the rank values (one per row in `x`)
     /// * `Err` if "rank" is not found in the feature names
-    pub fn get_rank_column(&self) -> anyhow::Result<Array1<f32>> {
+    pub fn get_rank_column(&self) -> anyhow::Result<Array1<u32>> {
         let Some(rank_idx) = self
             .psm_metadata
             .feature_names
             .iter()
             .position(|name| name == "rank")
@@ -158,7 +158,20 @@ impl Experiment {
             anyhow::bail!("'rank' feature not found in feature_names");
         };
 
-        Ok(self.x.column(rank_idx).to_owned())
+        let rank_f32 = self.x.column(rank_idx);
+
+        let rank_u32 = rank_f32
+            .iter()
+            .map(|&val| {
+                if val.is_finite() && val >= 0.0 {
+                    val.round() as u32
+                } else {
+                    0 // fallback: treat NaNs or negatives as rank 0 (could also bail or panic if preferred)
+                }
+            })
+            .collect::<Array1<u32>>();
+
+        Ok(rank_u32)
     }
 
 
diff --git a/crates/redeem-classifiers/src/psm_scorer.rs b/crates/redeem-classifiers/src/psm_scorer.rs
index 04c7626..2280a23 100644
--- a/crates/redeem-classifiers/src/psm_scorer.rs
+++ b/crates/redeem-classifiers/src/psm_scorer.rs
@@ -321,7 +321,7 @@ impl SemiSupervisedLearner {
     /// # Returns
     ///
     /// The predictions for the input features
-    pub fn fit(&mut self, x: Array2, y: Array1, psm_metadata: PsmMetadata) -> anyhow::Result<(Array1<f32>, Array1<f32>)> {
+    pub fn fit(&mut self, x: Array2, y: Array1, psm_metadata: PsmMetadata) -> anyhow::Result<(Array1<f32>, Array1<u32>)> {
 
         let mut experiment = Experiment::new(x.clone(), y.clone(), psm_metadata.clone());
 
From fd447f3d364dce02e1b512873175377572032cec Mon Sep 17 00:00:00 2001
From: singjc
Date: Thu, 15 May 2025 18:00:40 -0400
Subject: [PATCH 64/75] refactor: Improve bidirectional LSTM input handling for
 contiguous tensors

---
 crates/redeem-properties/src/building_blocks/bilstm.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs
index 43d4e1a..dca552a 100644
--- a/crates/redeem-properties/src/building_blocks/bilstm.rs
+++ b/crates/redeem-properties/src/building_blocks/bilstm.rs
@@ -110,8 +110,11 @@ impl BidirectionalLSTM {
         let h0_2 = h0.narrow(0, 2, 2)?;
         let c0_2 = c0.narrow(0, 2, 2)?;
 
-        let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?;
+        let xs = xs.contiguous()?;
+
+        let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(&xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?;
+
+        let out1 = out1.contiguous()?;
         let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2,
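+        // The .contiguous() calls added above materialize the strided views
+        // produced by narrow()/expand()/stack() before they reach the LSTM
+        // layers; candle's matmul-backed LSTM step appears to require
+        // contiguous inputs (an inference from the is_contiguous() debugging
+        // in the patches that follow, not a claim made in this patch).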
&self.backward_lstm2, &h0_2, &c0_2)?; let hn = Tensor::cat(&[hn1, hn2], 0)?; From 2e3bd90ba4c1e530b93905b7955af8be8325ae97 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 18:45:20 -0400 Subject: [PATCH 65/75] refactor: Improve bidirectional LSTM input handling for contiguous tensors --- crates/redeem-properties/src/building_blocks/bilstm.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index dca552a..bfb11f7 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -58,8 +58,9 @@ impl BidirectionalLSTM { let h0_forward = h0.i(0)?; let c0_forward = c0.i(0)?; let state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward }; - - let out_fw_states = lstm_forward.seq_init(input, &state_fw)?; + + let input = input.contiguous()?; + let out_fw_states = lstm_forward.seq_init(&input, &state_fw)?; let out_fw = Tensor::stack( &out_fw_states.iter().map(|s| s.h()).collect::>(), 1, @@ -74,7 +75,7 @@ impl BidirectionalLSTM { .map(|t| input.i((.., t..=t, ..))) .collect::>>()?, 1, - )?; + )?.contiguous()?; // Initial states for backward let h0_backward = h0.i(1)?; From 930f21a5b119e91b209e44d2c54e3d1f8a7d9044 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 19:05:22 -0400 Subject: [PATCH 66/75] refactor: Improve initialization of hidden states in BidirectionalLSTM --- crates/redeem-properties/src/building_blocks/bilstm.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index bfb11f7..c39235f 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -103,8 +103,9 @@ impl BidirectionalLSTM { /// Forward with hidden states returned pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> { let (batch_size, _, _) = xs.dims3()?; - let h0 = self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; - let c0 = self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + let h0 = self.h0.unsqueeze(1)?.repeat((1, batch_size, 1))?; + let c0 = self.c0.unsqueeze(1)?.repeat((1, batch_size, 1))?; + let h0_1 = h0.narrow(0, 0, 2)?; let c0_1 = c0.narrow(0, 0, 2)?; From d8a32c54d9d4c64bf42fa033c587a00b5e351b5f Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 19:25:41 -0400 Subject: [PATCH 67/75] debug: bilstm forward with state --- .../src/building_blocks/bilstm.rs | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index c39235f..d4c52ad 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -54,17 +54,20 @@ impl BidirectionalLSTM { ) -> Result<(Tensor, (Tensor, Tensor))> { let (_batch_size, seq_len, _input_size) = input.dims3()?; + log::debug!("Entering apply_bidirectional_layer"); + // Initial states for forward let h0_forward = h0.i(0)?; let c0_forward = c0.i(0)?; - let state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward }; - + let state_fw = rnn::LSTMState { h: h0_forward.clone(), c: c0_forward.clone() }; + let input = input.contiguous()?; + log::debug!("Forward input shape: {:?}, is_contiguous: 
{}", input.shape(), input.is_contiguous()); + let out_fw_states = lstm_forward.seq_init(&input, &state_fw)?; - let out_fw = Tensor::stack( - &out_fw_states.iter().map(|s| s.h()).collect::>(), - 1, - )?; + let out_fw = Tensor::stack(&out_fw_states.iter().map(|s| s.h()).collect::>(), 1)?; + log::debug!("out_fw shape: {:?}, is_contiguous: {}", out_fw.shape(), out_fw.is_contiguous()); + let last_fw_h = out_fw_states.last().unwrap().h().clone(); let last_fw_c = out_fw_states.last().unwrap().c().clone(); @@ -76,17 +79,17 @@ impl BidirectionalLSTM { .collect::>>()?, 1, )?.contiguous()?; - + log::debug!("Backward input_reversed shape: {:?}, is_contiguous: {}", input_reversed.shape(), input_reversed.is_contiguous()); + // Initial states for backward let h0_backward = h0.i(1)?; let c0_backward = c0.i(1)?; - let state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward }; + let state_bw = rnn::LSTMState { h: h0_backward.clone(), c: c0_backward.clone() }; let out_bw_states = lstm_backward.seq_init(&input_reversed, &state_bw)?; - let out_bw = Tensor::stack( - &out_bw_states.iter().map(|s| s.h()).collect::>(), - 1, - )?; + let out_bw = Tensor::stack(&out_bw_states.iter().map(|s| s.h()).collect::>(), 1)?; + log::debug!("out_bw shape: {:?}, is_contiguous: {}", out_bw.shape(), out_bw.is_contiguous()); + let last_bw_h = out_bw_states.last().unwrap().h().clone(); let last_bw_c = out_bw_states.last().unwrap().c().clone(); @@ -94,31 +97,35 @@ impl BidirectionalLSTM { let hn = Tensor::stack(&[last_fw_h.clone(), last_bw_h.clone()], 0)?; let cn = Tensor::stack(&[last_fw_c, last_bw_c], 0)?; let output = Tensor::cat(&[out_fw, out_bw], 2)?; + log::debug!("Combined output shape: {:?}, is_contiguous: {}", output.shape(), output.is_contiguous()); Ok((output, (hn, cn))) } - - + /// Forward with hidden states returned pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> { + log::debug!("Input xs shape: {:?}, is_contiguous: {}", xs.shape(), xs.is_contiguous()); + let (batch_size, _, _) = xs.dims3()?; - let h0 = self.h0.unsqueeze(1)?.repeat((1, batch_size, 1))?; - let c0 = self.c0.unsqueeze(1)?.repeat((1, batch_size, 1))?; - - + let h0 = self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + let c0 = self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + let h0_1 = h0.narrow(0, 0, 2)?; let c0_1 = c0.narrow(0, 0, 2)?; let h0_2 = h0.narrow(0, 2, 2)?; let c0_2 = c0.narrow(0, 2, 2)?; - + let xs = xs.contiguous()?; - + log::debug!("xs after contiguous shape: {:?}, is_contiguous: {}", xs.shape(), xs.is_contiguous()); + let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(&xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?; - + let out1 = out1.contiguous()?; + log::debug!("out1 after first layer shape: {:?}, is_contiguous: {}", out1.shape(), out1.is_contiguous()); + let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?; - + let hn = Tensor::cat(&[hn1, hn2], 0)?; let cn = Tensor::cat(&[cn1, cn2], 0)?; Ok((out2, (hn, cn))) From 1f779fdd0ef885bba52921b1cb5c3651dcd3ba0a Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 19:41:16 -0400 Subject: [PATCH 68/75] refactor: Improve bidirectional LSTM forward and backward processing --- .../src/building_blocks/bilstm.rs | 45 ++++++++----------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index 
d4c52ad..cf2eb71 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -56,51 +56,44 @@ impl BidirectionalLSTM { log::debug!("Entering apply_bidirectional_layer"); - // Initial states for forward + // Forward let h0_forward = h0.i(0)?; let c0_forward = c0.i(0)?; - let state_fw = rnn::LSTMState { h: h0_forward.clone(), c: c0_forward.clone() }; + let mut state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward }; - let input = input.contiguous()?; - log::debug!("Forward input shape: {:?}, is_contiguous: {}", input.shape(), input.is_contiguous()); - - let out_fw_states = lstm_forward.seq_init(&input, &state_fw)?; + let mut out_fw_states = Vec::with_capacity(seq_len); + for t in 0..seq_len { + let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?; + state_fw = lstm_forward.step(&xt, &state_fw)?; + out_fw_states.push(state_fw.clone()); + } let out_fw = Tensor::stack(&out_fw_states.iter().map(|s| s.h()).collect::>(), 1)?; - log::debug!("out_fw shape: {:?}, is_contiguous: {}", out_fw.shape(), out_fw.is_contiguous()); - let last_fw_h = out_fw_states.last().unwrap().h().clone(); let last_fw_c = out_fw_states.last().unwrap().c().clone(); - // Reverse sequence - let input_reversed = Tensor::cat( - &(0..seq_len) - .rev() - .map(|t| input.i((.., t..=t, ..))) - .collect::>>()?, - 1, - )?.contiguous()?; - log::debug!("Backward input_reversed shape: {:?}, is_contiguous: {}", input_reversed.shape(), input_reversed.is_contiguous()); - - // Initial states for backward + // Backward let h0_backward = h0.i(1)?; let c0_backward = c0.i(1)?; - let state_bw = rnn::LSTMState { h: h0_backward.clone(), c: c0_backward.clone() }; + let mut state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward }; - let out_bw_states = lstm_backward.seq_init(&input_reversed, &state_bw)?; + let mut out_bw_states = Vec::with_capacity(seq_len); + for t in (0..seq_len).rev() { + let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?; + state_bw = lstm_backward.step(&xt, &state_bw)?; + out_bw_states.push(state_bw.clone()); + } + out_bw_states.reverse(); let out_bw = Tensor::stack(&out_bw_states.iter().map(|s| s.h()).collect::>(), 1)?; - log::debug!("out_bw shape: {:?}, is_contiguous: {}", out_bw.shape(), out_bw.is_contiguous()); - let last_bw_h = out_bw_states.last().unwrap().h().clone(); let last_bw_c = out_bw_states.last().unwrap().c().clone(); - // Combine hidden and cell states - let hn = Tensor::stack(&[last_fw_h.clone(), last_bw_h.clone()], 0)?; + let hn = Tensor::stack(&[last_fw_h, last_bw_h], 0)?; let cn = Tensor::stack(&[last_fw_c, last_bw_c], 0)?; let output = Tensor::cat(&[out_fw, out_bw], 2)?; - log::debug!("Combined output shape: {:?}, is_contiguous: {}", output.shape(), output.is_contiguous()); Ok((output, (hn, cn))) } + /// Forward with hidden states returned From 80469a0c33edbed5823067e468c5e0924401d68d Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 19:55:07 -0400 Subject: [PATCH 69/75] debugging bilstm --- .../src/building_blocks/bilstm.rs | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index cf2eb71..a71fdb0 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -77,11 +77,17 @@ impl BidirectionalLSTM { let mut state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward }; let mut out_bw_states 
= Vec::with_capacity(seq_len); - for t in (0..seq_len).rev() { + for t in 0..seq_len { let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?; - state_bw = lstm_backward.step(&xt, &state_bw)?; - out_bw_states.push(state_bw.clone()); + + log::debug!("[step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); + log::debug!("[step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); + log::debug!("[step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); + + state_fw = lstm_forward.step(&xt, &state_fw)?; + out_fw_states.push(state_fw.clone()); } + out_bw_states.reverse(); let out_bw = Tensor::stack(&out_bw_states.iter().map(|s| s.h()).collect::>(), 1)?; let last_bw_h = out_bw_states.last().unwrap().h().clone(); @@ -112,10 +118,19 @@ impl BidirectionalLSTM { let xs = xs.contiguous()?; log::debug!("xs after contiguous shape: {:?}, is_contiguous: {}", xs.shape(), xs.is_contiguous()); + log::debug!("forward_with_state: xs shape = {:?}, strides = {:?}", xs.shape(), xs.stride()); + log::debug!("h0_1 shape: {:?}, strides: {:?}", h0.shape(), h0.stride()); + log::debug!("c0_1 shape: {:?}, strides: {:?}", c0.shape(), c0.stride()); + let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(&xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?; let out1 = out1.contiguous()?; log::debug!("out1 after first layer shape: {:?}, is_contiguous: {}", out1.shape(), out1.is_contiguous()); + + log::debug!("forward_with_state: out1 shape = {:?}, strides = {:?}", out1.shape(), out1.stride()); + log::debug!("h0_2 shape: {:?}, strides: {:?}", h0.shape(), h0.stride()); + log::debug!("c0_2 shape: {:?}, strides: {:?}", c0.shape(), c0.stride()); + let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?; From 0c56fd27222c953fa1efc527c02152270181c51f Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 21:18:08 -0400 Subject: [PATCH 70/75] add: type annotation for bilstm --- crates/redeem-properties/src/building_blocks/bilstm.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index a71fdb0..dfeddb3 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -67,7 +67,7 @@ impl BidirectionalLSTM { state_fw = lstm_forward.step(&xt, &state_fw)?; out_fw_states.push(state_fw.clone()); } - let out_fw = Tensor::stack(&out_fw_states.iter().map(|s| s.h()).collect::>(), 1)?; + let out_fw = Tensor::stack(&out_fw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?; let last_fw_h = out_fw_states.last().unwrap().h().clone(); let last_fw_c = out_fw_states.last().unwrap().c().clone(); @@ -89,7 +89,7 @@ impl BidirectionalLSTM { } out_bw_states.reverse(); - let out_bw = Tensor::stack(&out_bw_states.iter().map(|s| s.h()).collect::>(), 1)?; + let out_bw = Tensor::stack(&out_bw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?; let last_bw_h = out_bw_states.last().unwrap().h().clone(); let last_bw_c = out_bw_states.last().unwrap().c().clone(); From 54af169d3fb014278856d84d0796583fed5cff42 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 21:45:10 -0400 Subject: [PATCH 71/75] refactor: Clone contiguous tensor in BidirectionalLSTM for improved handling --- crates/redeem-properties/src/building_blocks/bilstm.rs | 4 +++- 1 file changed, 3 insertions(+), 1 
deletion(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index dfeddb3..044750d 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -78,7 +78,9 @@ impl BidirectionalLSTM { let mut out_bw_states = Vec::with_capacity(seq_len); for t in 0..seq_len { - let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?; + let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone(); + + log::debug!("xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); log::debug!("[step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); log::debug!("[step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); From f89c0a810ad59b09fa0e9fb2d0cf839ed883de3f Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 22:22:50 -0400 Subject: [PATCH 72/75] refactor: Improve logging in BidirectionalLSTM backward processing --- .../src/building_blocks/bilstm.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index 044750d..025816b 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -63,7 +63,14 @@ impl BidirectionalLSTM { let mut out_fw_states = Vec::with_capacity(seq_len); for t in 0..seq_len { - let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?; + let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone(); + + log::debug!("[backward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); + + log::debug!("[backward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); + log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); + log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); + state_fw = lstm_forward.step(&xt, &state_fw)?; out_fw_states.push(state_fw.clone()); } @@ -80,11 +87,11 @@ impl BidirectionalLSTM { for t in 0..seq_len { let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone(); - log::debug!("xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); + log::debug!("[backward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); - log::debug!("[step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); - log::debug!("[step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); - log::debug!("[step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); + log::debug!("[backward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); + log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); + log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); state_fw = lstm_forward.step(&xt, &state_fw)?; out_fw_states.push(state_fw.clone()); From b89e9ddadd6c6b484c471d3b113372350d145bea Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 22:47:29 -0400 Subject: [PATCH 73/75] more debugging for bilstm --- .../src/building_blocks/bilstm.rs | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 
deletions(-) diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs index 025816b..3a6d19a 100644 --- a/crates/redeem-properties/src/building_blocks/bilstm.rs +++ b/crates/redeem-properties/src/building_blocks/bilstm.rs @@ -65,18 +65,18 @@ impl BidirectionalLSTM { for t in 0..seq_len { let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone(); - log::debug!("[backward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); + log::debug!("[forward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); - log::debug!("[backward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); - log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); - log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); - + log::debug!("[forward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); + log::debug!("[forward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); + log::debug!("[forward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); + state_fw = lstm_forward.step(&xt, &state_fw)?; out_fw_states.push(state_fw.clone()); } - let out_fw = Tensor::stack(&out_fw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?; - let last_fw_h = out_fw_states.last().unwrap().h().clone(); - let last_fw_c = out_fw_states.last().unwrap().c().clone(); + let out_fw = Tensor::stack(&out_fw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?.contiguous()?; + let last_fw_h = out_fw_states.last().unwrap().h().clone().contiguous()?; + let last_fw_c = out_fw_states.last().unwrap().c().clone().contiguous()?; // Backward let h0_backward = h0.i(1)?; @@ -90,17 +90,17 @@ impl BidirectionalLSTM { log::debug!("[backward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous()); log::debug!("[backward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); - log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride()); - log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride()); + log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_bw.h.shape(), state_bw.h.stride()); + log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_bw.c.shape(), state_bw.c.stride()); - state_fw = lstm_forward.step(&xt, &state_fw)?; - out_fw_states.push(state_fw.clone()); + state_bw = lstm_backward.step(&xt, &state_bw)?; + out_bw_states.push(state_bw.clone()); } out_bw_states.reverse(); - let out_bw = Tensor::stack(&out_bw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?; - let last_bw_h = out_bw_states.last().unwrap().h().clone(); - let last_bw_c = out_bw_states.last().unwrap().c().clone(); + let out_bw = Tensor::stack(&out_bw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?.contiguous()?; + let last_bw_h = out_bw_states.last().unwrap().h().clone().contiguous()?; + let last_bw_c = out_bw_states.last().unwrap().c().clone().contiguous()?; let hn = Tensor::stack(&[last_fw_h, last_bw_h], 0)?; let cn = Tensor::stack(&[last_fw_c, last_bw_c], 0)?; From 74b070303f31e6086ee536797bf9a7115eddb468 Mon Sep 17 00:00:00 2001 From: singjc Date: Thu, 15 May 2025 23:00:19 -0400 Subject: 
[PATCH 74/75] revert: apply_bidirectional_layer and forward_with_state in
 bilstm to earlier version before refactoring

---
 .../src/building_blocks/bilstm.rs             | 142 ++++++++----------
 1 file changed, 59 insertions(+), 83 deletions(-)

diff --git a/crates/redeem-properties/src/building_blocks/bilstm.rs b/crates/redeem-properties/src/building_blocks/bilstm.rs
index 3a6d19a..805bc5d 100644
--- a/crates/redeem-properties/src/building_blocks/bilstm.rs
+++ b/crates/redeem-properties/src/building_blocks/bilstm.rs
@@ -1,4 +1,4 @@
-use candle_core::{IndexOp, Result, Tensor};
+use candle_core::{DType, IndexOp, Result, Tensor};
 
 use candle_nn::{rnn, Module, VarBuilder, RNN};
 
@@ -44,108 +44,84 @@ impl BidirectionalLSTM {
         })
     }
 
-    fn apply_bidirectional_layer(
-        &self,
-        input: &Tensor,
-        lstm_forward: &rnn::LSTM,
-        lstm_backward: &rnn::LSTM,
-        h0: &Tensor,
-        c0: &Tensor,
-    ) -> Result<(Tensor, (Tensor, Tensor))> {
-        let (_batch_size, seq_len, _input_size) = input.dims3()?;
+    fn apply_bidirectional_layer(&self, input: &Tensor, lstm_forward: &rnn::LSTM, lstm_backward: &rnn::LSTM, h0: &Tensor, c0: &Tensor, layer_idx: &i32) -> Result<(Tensor, (Tensor, Tensor))> {
+        let (batch_size, seq_len, input_size) = input.dims3()?;
 
-        log::debug!("Entering apply_bidirectional_layer");
+        // Print first and last 5 values of the original input
+        let input_vec = input.to_vec3::()?;
 
-        // Forward
-        let h0_forward = h0.i(0)?;
-        let c0_forward = c0.i(0)?;
-        let mut state_fw = rnn::LSTMState { h: h0_forward, c: c0_forward };
-
-        let mut out_fw_states = Vec::with_capacity(seq_len);
-        for t in 0..seq_len {
-            let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone();
-
-            log::debug!("[forward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(), xt.stride(), xt.is_contiguous());
+        // Forward pass
+        let h0_forward = h0.narrow(0, 0, 1)?.reshape((batch_size, h0.dim(2)?))?;
+        let c0_forward = c0.narrow(0, 0, 1)?.reshape((batch_size, c0.dim(2)?))?;
 
-            log::debug!("[forward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride());
-            log::debug!("[forward] [step][fw] h shape: {:?}, strides: {:?}", state_fw.h.shape(), state_fw.h.stride());
-            log::debug!("[forward] [step][fw] c shape: {:?}, strides: {:?}", state_fw.c.shape(), state_fw.c.stride());
+        let state_forward = rnn::LSTMState{ h: h0_forward.clone(), c: c0_forward.clone() };
 
-            state_fw = lstm_forward.step(&xt, &state_fw)?;
-            out_fw_states.push(state_fw.clone());
-        }
-        let out_fw = Tensor::stack(&out_fw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?.contiguous()?;
-        let last_fw_h = out_fw_states.last().unwrap().h().clone().contiguous()?;
-        let last_fw_c = out_fw_states.last().unwrap().c().clone().contiguous()?;
+        let output_forward_states: Vec = lstm_forward.seq_init(&input, &state_forward)?;
+        let output_forward = Tensor::stack(&output_forward_states.iter().map(|state| state.h().clone()).collect::>(), 1)?;
+        let last_forward_state = output_forward_states.last().unwrap().h().clone();
 
-        // Backward
-        let h0_backward = h0.i(1)?;
-        let c0_backward = c0.i(1)?;
-        let mut state_bw = rnn::LSTMState { h: h0_backward, c: c0_backward };
-
-        let mut out_bw_states = Vec::with_capacity(seq_len);
-        for t in 0..seq_len {
-            let xt = input.i((.., t..=t, ..))?.squeeze(1)?.contiguous()?.clone();
+        // Backward pass
+        let h0_backward = h0.narrow(0, 1, 1)?.reshape((batch_size, h0.dim(2)?))?;
+        let c0_backward = c0.narrow(0, 1, 1)?.reshape((batch_size, c0.dim(2)?))?;
 
-            log::debug!("[backward] xt shape: {:?}, strides: {:?}, is_contiguous: {}", xt.shape(),
xt.stride(), xt.is_contiguous()); - - log::debug!("[backward] [step][fw] xt shape: {:?}, strides: {:?}", xt.shape(), xt.stride()); - log::debug!("[backward] [step][fw] h shape: {:?}, strides: {:?}", state_bw.h.shape(), state_bw.h.stride()); - log::debug!("[backward] [step][fw] c shape: {:?}, strides: {:?}", state_bw.c.shape(), state_bw.c.stride()); - - state_bw = lstm_backward.step(&xt, &state_bw)?; - out_bw_states.push(state_bw.clone()); + let state_backward = rnn::LSTMState{ h: h0_backward.clone(), c: c0_backward.clone() }; + + // Correctly reverse the input sequence + let mut reversed_input = vec![vec![vec![0.0; input_size]; seq_len]; batch_size]; + for b in 0..batch_size { + for t in 0..seq_len { + for i in 0..input_size { + reversed_input[b][seq_len - t - 1][i] = input_vec[b][t][i]; + } + } } + let input_reversed = Tensor::new(reversed_input, input.device())? + .to_dtype(DType::F32)? + .reshape((batch_size, seq_len, input_size))?; + + // Print first and last 5 values of the reversed input + // let reversed_input_vec = input_reversed.to_vec3::()?; + + + let output_backward_states = lstm_backward.seq_init(&input_reversed, &state_backward)?; + let output_backward = Tensor::stack(&output_backward_states.iter().map(|state| state.h().clone()).collect::>(), 1)?; - out_bw_states.reverse(); - let out_bw = Tensor::stack(&out_bw_states.iter().map(|s: &rnn::LSTMState| s.h()).collect::>(), 1)?.contiguous()?; - let last_bw_h = out_bw_states.last().unwrap().h().clone().contiguous()?; - let last_bw_c = out_bw_states.last().unwrap().c().clone().contiguous()?; + // Use the last state of the backward LSTM (which corresponds to the first element of the original sequence) + let last_backward_state = output_backward_states.last().unwrap().h().clone(); + + // Combine the forward and backward hidden states for hn + let hn = Tensor::cat(&[last_forward_state.unsqueeze(0)?, last_backward_state.unsqueeze(0)?], 0)?; // Shape: [2, 1, 128] + let hn_concat = Tensor::cat(&[last_forward_state, last_backward_state], 1)?; // Shape: [1, 256] + + // Combine the forward and backwards cell states for cn + let cn = Tensor::cat(&[output_forward_states.last().unwrap().c().clone(), output_backward_states.last().unwrap().c().clone()], 0)?; // Shape: [2, 1, 128] - let hn = Tensor::stack(&[last_fw_h, last_bw_h], 0)?; - let cn = Tensor::stack(&[last_fw_c, last_bw_c], 0)?; - let output = Tensor::cat(&[out_fw, out_bw], 2)?; + // The output_backward is already in the correct order for the original sequence + let output = Tensor::cat(&[output_forward, output_backward], 2)?; // Shape: [1, 13, 256] Ok((output, (hn, cn))) } - - - /// Forward with hidden states returned - pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> { - log::debug!("Input xs shape: {:?}, is_contiguous: {}", xs.shape(), xs.is_contiguous()); - - let (batch_size, _, _) = xs.dims3()?; - let h0 = self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; - let c0 = self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + // New method that returns output and states + pub fn forward_with_state(&self, xs: &Tensor) -> Result<(Tensor, (Tensor, Tensor))> { + let (batch_size, seq_len, input_size) = xs.dims3()?; + + let h0 = &self.h0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + let c0 = &self.c0.expand((self.num_layers * 2, batch_size, self.hidden_size))?; + let h0_1 = h0.narrow(0, 0, 2)?; - let c0_1 = c0.narrow(0, 0, 2)?; let h0_2 = h0.narrow(0, 2, 2)?; + let c0_1 = c0.narrow(0, 0, 2)?; let c0_2 = 
c0.narrow(0, 2, 2)?; - - let xs = xs.contiguous()?; - log::debug!("xs after contiguous shape: {:?}, is_contiguous: {}", xs.shape(), xs.is_contiguous()); - - log::debug!("forward_with_state: xs shape = {:?}, strides = {:?}", xs.shape(), xs.stride()); - log::debug!("h0_1 shape: {:?}, strides: {:?}", h0.shape(), h0.stride()); - log::debug!("c0_1 shape: {:?}, strides: {:?}", c0.shape(), c0.stride()); - let (out1, (hn1, cn1)) = self.apply_bidirectional_layer(&xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1)?; - - let out1 = out1.contiguous()?; - log::debug!("out1 after first layer shape: {:?}, is_contiguous: {}", out1.shape(), out1.is_contiguous()); + let (layer1_output, (hn1, cn1)) = self.apply_bidirectional_layer(xs, &self.forward_lstm1, &self.backward_lstm1, &h0_1, &c0_1, &1)?; + let (layer2_output, (hn2, cn2)) = self.apply_bidirectional_layer(&layer1_output, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2, &2)?; - log::debug!("forward_with_state: out1 shape = {:?}, strides = {:?}", out1.shape(), out1.stride()); - log::debug!("h0_2 shape: {:?}, strides: {:?}", h0.shape(), h0.stride()); - log::debug!("c0_2 shape: {:?}, strides: {:?}", c0.shape(), c0.stride()); + let final_hn = Tensor::cat(&[hn1, hn2], 0)?; + let final_cn = Tensor::cat(&[cn1, cn2], 0)?; - - let (out2, (hn2, cn2)) = self.apply_bidirectional_layer(&out1, &self.forward_lstm2, &self.backward_lstm2, &h0_2, &c0_2)?; - - let hn = Tensor::cat(&[hn1, hn2], 0)?; - let cn = Tensor::cat(&[cn1, cn2], 0)?; - Ok((out2, (hn, cn))) + Ok((layer2_output, (final_hn, final_cn))) } pub fn input_size(&self) -> usize { From 8c7c70e581d4cc5243cb17790afedc29419a9244 Mon Sep 17 00:00:00 2001 From: singjc Date: Tue, 27 May 2025 23:00:04 -0400 Subject: [PATCH 75/75] minor --- .../src/properties/train/trainer.rs | 41 +++++-- .../src/building_blocks/building_blocks.rs | 24 +++-- .../src/models/model_interface.rs | 101 ++++++------------ .../src/models/rt_cnn_transformer_model.rs | 6 +- 4 files changed, 81 insertions(+), 91 deletions(-) diff --git a/crates/redeem-cli/src/properties/train/trainer.rs b/crates/redeem-cli/src/properties/train/trainer.rs index cd06a79..0cec5b3 100644 --- a/crates/redeem-cli/src/properties/train/trainer.rs +++ b/crates/redeem-cli/src/properties/train/trainer.rs @@ -164,7 +164,7 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { let mut overview_section = ReportSection::new("Overview"); overview_section.add_content(html! { - "This report summarizes the training process of the ReDeeM model. It includes epoch-level summaries and step-wise dynamics such as learning rate scheduling and accuracy tracking over time. These plots provide insight into model convergence behavior and training stability." + "This report summarizes the training process of the {} model. It includes epoch-level summaries and step-wise dynamics such as learning rate scheduling and accuracy tracking over time. These plots provide insight into model convergence behavior and training stability." 
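+        // If `html!` here is maud's macro, the "{}" inside the string literal
+        // above is rendered verbatim rather than interpolated; splicing a
+        // value into the markup would use maud's parenthesized splice syntax,
+        // e.g. (config.model_arch). The templating crate is an assumption;
+        // this patch does not identify it.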
}); let epoch_losses = train_step_metrics.summarize_loss_for_plotting(); @@ -209,17 +209,38 @@ pub fn run_training(config: &PropertyTrainConfig) -> Result<()> { .iter() .zip(&inference_results) .filter_map(|(true_pep, pred_pep)| { - match (true_pep.retention_time, pred_pep.retention_time) { - (Some(t), Some(p)) => { - let t_denorm = match norm_factor { - TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64, - TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64, - TargetNormalization::None => t as f64, - }; - Some((t_denorm, p as f64)) + // check if model is RT or CCS + if config.model_arch == "ccs_cnn_lstm" || config.model_arch == "ccs_cnn_tf" { + match (true_pep.ccs, pred_pep.ccs) { + (Some(t), Some(p)) => { + let t_denorm = match norm_factor { + TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64, + TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64, + TargetNormalization::None => t as f64, + }; + Some((t_denorm, p as f64)) + } + _ => None, + } - _ => None, } + else if config.model_arch == "rt_cnn_lstm" || config.model_arch == "rt_cnn_tf" { + match (true_pep.retention_time, pred_pep.retention_time) { + (Some(t), Some(p)) => { + let t_denorm = match norm_factor { + TargetNormalization::ZScore(mean, std) => t as f64 * std as f64 + mean as f64, + TargetNormalization::MinMax(min, range) => t as f64 * range as f64 + min as f64, + TargetNormalization::None => t as f64, + }; + Some((t_denorm, p as f64)) + } + _ => None, + + } + } else { + return None; + } + }) .unzip(); diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs index d99c914..f72823a 100644 --- a/crates/redeem-properties/src/building_blocks/building_blocks.rs +++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs @@ -850,12 +850,15 @@ impl Encoder26aaModCnnLstmAttnSum { .map_err(|e| candle_core::Error::Msg(e.to_string()))?; let (mean, min, max) = get_tensor_stats(&x)?; - log::trace!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); + // log::trace!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}"); - let start_time = Instant::now(); let x = self.input_cnn.forward(&x)?; + let (b, s, d) = x.dims3()?; + println!("x (post input_cnn): batch size: {b}, seq len: {s}, embedding dim: {d}"); let x = self.input_lstm.forward(&x)?; + let (b, s, d) = x.dims3()?; + println!("x (post input_lstm): batch size: {b}, seq len: {s}, embedding dim: {d}"); let x = self.attn_sum.forward(&x)?; @@ -982,8 +985,8 @@ impl Encoder26aaModCnnTransformerAttnSum { names_input_cnn_bias, )?, proj_cnn_to_transformer: candle_nn::Linear::new( - varstore.get((input_dim * 4, hidden_dim), "proj_cnn_to_transformer.weight")?, - Some(varstore.get(hidden_dim, "proj_cnn_to_transformer.bias")?), + varstore.get((hidden_dim, input_dim * 4), "proj_cnn_to_transformer.weight")?, + None, ), input_transformer: SeqTransformer::from_varstore( varstore.pp(transformer_pp).clone(), @@ -1020,7 +1023,7 @@ impl Encoder26aaModCnnTransformerAttnSum { Ok(Self { mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?, input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?, - proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim*4, hidden_dim, varbuilder.pp("proj_cnn_to_transformer"))?, + proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim * 
diff --git a/crates/redeem-properties/src/building_blocks/building_blocks.rs b/crates/redeem-properties/src/building_blocks/building_blocks.rs
index d99c914..f72823a 100644
--- a/crates/redeem-properties/src/building_blocks/building_blocks.rs
+++ b/crates/redeem-properties/src/building_blocks/building_blocks.rs
@@ -850,12 +850,15 @@ impl Encoder26aaModCnnLstmAttnSum {
             .map_err(|e| candle_core::Error::Msg(e.to_string()))?;
 
         let (mean, min, max) = get_tensor_stats(&x)?;
-        log::trace!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}");
+        // log::trace!("[Encoder26aaModCnnLstmAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}");
 
-        let start_time = Instant::now();
         let x = self.input_cnn.forward(&x)?;
+        let (b, s, d) = x.dims3()?;
+        println!("x (post input_cnn): batch size: {b}, seq len: {s}, embedding dim: {d}");
         let x = self.input_lstm.forward(&x)?;
+        let (b, s, d) = x.dims3()?;
+        println!("x (post input_lstm): batch size: {b}, seq len: {s}, embedding dim: {d}");
 
         let x = self.attn_sum.forward(&x)?;
 
@@ -982,8 +985,8 @@ impl Encoder26aaModCnnTransformerAttnSum {
                 names_input_cnn_bias,
             )?,
             proj_cnn_to_transformer: candle_nn::Linear::new(
-                varstore.get((input_dim * 4, hidden_dim), "proj_cnn_to_transformer.weight")?,
-                Some(varstore.get(hidden_dim, "proj_cnn_to_transformer.bias")?),
+                varstore.get((hidden_dim, input_dim * 4), "proj_cnn_to_transformer.weight")?,
+                None,
             ),
             input_transformer: SeqTransformer::from_varstore(
                 varstore.pp(transformer_pp).clone(),
@@ -1020,7 +1023,7 @@ impl Encoder26aaModCnnTransformerAttnSum {
         Ok(Self {
             mod_nn: ModEmbeddingFixFirstK::new(MOD_FEATURE_SIZE, mod_hidden_dim, &varbuilder.pp("mod_nn"))?,
             input_cnn: SeqCNN::new(input_dim, &varbuilder.pp("input_cnn"))?,
-            proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim*4, hidden_dim, varbuilder.pp("proj_cnn_to_transformer"))?,
+            proj_cnn_to_transformer: candle_nn::linear_no_bias(input_dim * 4, hidden_dim, varbuilder.pp("proj_cnn_to_transformer"))?,
             input_transformer: SeqTransformer::new(
                 &varbuilder.pp("input_transformer"),
                 input_dim * 4,
@@ -1045,17 +1048,19 @@ impl Encoder26aaModCnnTransformerAttnSum {
             .map_err(|e| candle_core::Error::Msg(e.to_string()))?;
 
         let (mean, min, max) = get_tensor_stats(&x)?;
-        log::trace!("[Encoder26aaModCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}");
+        // log::trace!("[Encoder26aaModCnnTransformerAttnSum] one-hot output stats - min: {min}, max: {max}, mean: {mean}");
 
         if !mean.is_finite() || !min.is_finite() || !max.is_finite() {
             log::error!("ERROR [Encoder26aaModCnnTransformerAttnSum] aa_one_hot produced non-finite tensor stats: mean={mean}, min={min}, max={max}");
-            candle_core::bail!("ERRORNon-finite values found in peptide encoding output.");
+            candle_core::bail!("ERROR: Non-finite values found in peptide encoding output.");
         }
 
         let x = self.input_cnn.forward(&x)?;
         let x = x.contiguous()?;
+
         let x = self.proj_cnn_to_transformer.forward(&x)?;
         let x = x.contiguous()?;
+
         let x = self.input_transformer.forward(&x)?;
         let x = x.contiguous()?;
         let x = self.attn_sum.forward(&x)?;
@@ -1107,9 +1112,8 @@ impl Encoder26aaModChargeCnnTransformerAttnSum {
                 names_input_cnn_bias,
             )?,
             proj_cnn_to_transformer: candle_nn::Linear::new(
-                varstore.get((input_dim * 4, hidden_dim), "proj_cnn_to_transformer.weight")?,
-                Some(varstore.get(hidden_dim, "proj_cnn_to_transformer.bias")?),
-            ),
+                varstore.get((hidden_dim, input_dim * 4), "proj_cnn_to_transformer.weight")?,
+                None),
             input_transformer: SeqTransformer::from_varstore(
                 varstore.pp(transformer_pp).clone(),
                 input_dim * 4,
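The shape swap in both `from_varstore` constructors follows from how candle stores linear weights: `candle_nn::Linear` keeps its weight as `(out_features, in_features)` and forwards by multiplying with the transpose, so a projection from `input_dim * 4` to `hidden_dim` must load a `(hidden_dim, input_dim * 4)` tensor. A standalone sketch, with illustrative dimensions in place of the real `input_dim * 4` and `hidden_dim`:

use candle_core::{DType, Device, Result, Tensor};
use candle_nn::{Linear, Module};

fn main() -> Result<()> {
    let dev = Device::Cpu;
    let (in_features, out_features) = (32, 8); // stand-ins for input_dim * 4 and hidden_dim

    // candle stores Linear weights as (out_features, in_features) ...
    let w = Tensor::randn(0f32, 1.0, (out_features, in_features), &dev)?;
    let proj = Linear::new(w, None); // bias dropped, as in the patch

    // ... and forward computes x.matmul(w.t()), so a (batch, seq, in_features)
    // input maps to (batch, seq, out_features).
    let x = Tensor::zeros((4, 10, in_features), DType::F32, &dev)?;
    let y = proj.forward(&x)?;
    assert_eq!(y.dims3()?, (4, 10, out_features));
    Ok(())
}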
diff --git a/crates/redeem-properties/src/models/model_interface.rs b/crates/redeem-properties/src/models/model_interface.rs
index d4b7194..d5154a0 100644
--- a/crates/redeem-properties/src/models/model_interface.rs
+++ b/crates/redeem-properties/src/models/model_interface.rs
@@ -308,10 +308,10 @@ pub trait ModelInterface: Send + Sync + ModelClone {
         let mod_feature_size = self.get_mod_element_count();
         let mod_to_feature = self.get_mod_to_feature();
 
-        log::trace!(
-            "[ModelInterface::encode_peptide] peptide_sequence: {:?} | mods: {:?} | mod_sites: {:?} | charge: {:?} | nce: {:?} | instrument: {:?}",
-            peptide_sequence, mods, mod_sites, charge, nce, instrument
-        );
+        // log::trace!(
+        //     "[ModelInterface::encode_peptide] peptide_sequence: {:?} | mods: {:?} | mod_sites: {:?} | charge: {:?} | nce: {:?} | instrument: {:?}",
+        //     peptide_sequence, mods, mod_sites, charge, nce, instrument
+        // );
 
         let aa_tensor = aa_indices_tensor_from_arc(peptide_sequence, device)?;
         let (batch_size, seq_len, _) = aa_tensor.shape().dims3()?;
@@ -627,6 +627,10 @@ pub trait ModelInterface: Send + Sync + ModelClone {
 
         let acc = match self.property_type() {
             PropertyType::RT => {
+                // Print the first few predictions and targets as a quick sanity check.
+                let n = predictions.len().min(5);
+                println!("Predictions: {:?}", &predictions[..n]);
+                println!("Targets: {:?}", &targets[..n]);
                 Some(Metrics::accuracy(&predictions, &targets, 0.5))
             }
             PropertyType::CCS => {
@@ -739,7 +742,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
         &self,
         inference_data: &Vec<PeptideData>,
         batch_size: usize,
-        _modifications: HashMap<
+        modifications: HashMap<
             (String, Option<char>),
             crate::utils::peptdeep_utils::ModificationMap,
         >,
@@ -760,73 +763,33 @@ pub trait ModelInterface: Send + Sync + ModelClone {
             .enumerate()
             .map(|(batch_idx, batch_data)| {
                 let start_idx = batch_idx * batch_size;
-                let batch: PeptideBatchData = batch_data.into();
-                let naked_sequences = &batch.naked_sequence;
-                let mods = &batch.mods;
-                let mod_sites = &batch.mod_sites;
-
-                let charges = if batch.charges.iter().all(|c| c.is_some()) {
-                    Some(batch.charges.iter().map(|c| c.unwrap()).collect::<Vec<_>>())
-                } else {
-                    None
-                };
+                // Extract input features only (ignore targets)
+                let (input_tensor, _) = self.prepare_batch_inputs(batch_data, &modifications)?;
+                let predicted = self.forward(&input_tensor)?;
 
-                let nces = if batch.nces.iter().all(|n| n.is_some()) {
-                    Some(batch.nces.iter().map(|n| n.unwrap()).collect::<Vec<_>>())
-                } else {
-                    None
-                };
-
-                let instruments = if batch.instruments.iter().all(|i| i.is_some()) {
-                    Some(batch.instruments.clone())
-                } else {
-                    None
-                };
+                let predictions = predicted.to_vec1::<f32>()?;
 
-                let input_tensor = self
-                    .encode_peptides(naked_sequences, mods, mod_sites, charges, nces, instruments)?
-                    .to_device(self.get_device())?;
-                let output = self.forward(&input_tensor)?;
+                let updated = predictions
+                    .into_iter()
+                    .enumerate()
+                    .map(|(i, pred)| {
+                        let mut peptide = batch_data[i].clone();
+                        let value = match target_norm {
+                            TargetNormalization::ZScore(mean, std) => pred * std + mean,
+                            TargetNormalization::MinMax(min, max) => pred * (max - min) + min,
+                            TargetNormalization::None => pred,
+                        };
+                        match self.property_type() {
+                            PropertyType::RT => peptide.retention_time = Some(value),
+                            PropertyType::CCS => peptide.ccs = Some(value),
+                            _ => {}
+                        }
+                        (start_idx + i, peptide)
+                    })
+                    .collect::<Vec<_>>();
 
-                match self.property_type() {
-                    PropertyType::RT | PropertyType::CCS => {
-                        let predictions = output.to_vec1()?;
-                        let updated: Vec<(usize, PeptideData)> = predictions
-                            .into_iter()
-                            .enumerate()
-                            .map(|(i, pred)| {
-                                let mut peptide = batch_data[i].clone();
-                                match self.property_type() {
-                                    PropertyType::RT => {
-                                        peptide.retention_time = Some(match target_norm {
-                                            TargetNormalization::ZScore(mean, std) => pred * std + mean,
-                                            TargetNormalization::MinMax(min, max) => {
-                                                pred * (max - min) + min
-                                            }
-                                            TargetNormalization::None => pred,
-                                        });
-                                    }
-                                    PropertyType::CCS => {
-                                        peptide.ccs = Some(match target_norm {
-                                            TargetNormalization::ZScore(mean, std) => pred * std + mean,
-                                            TargetNormalization::MinMax(min, max) => {
-                                                pred * (max - min) + min
-                                            }
-                                            TargetNormalization::None => pred,
-                                        });
-                                    }
-                                    _ => {}
-                                }
-                                (start_idx + i, peptide)
-                            })
-                            .collect();
-                        Ok(updated)
-                    }
-                    PropertyType::MS2 => Err(anyhow::anyhow!(
-                        "Inference not supported for MS2 models in batch mode"
-                    )),
-                }
+                Ok(updated)
             })
             .collect::<Result<Vec<Vec<(usize, PeptideData)>>>>()?
             .into_iter()
@@ -838,7 +801,7 @@ pub trait ModelInterface: Send + Sync + ModelClone {
         progress.finish();
 
         Ok(result.into_iter().flatten().collect())
-    }
+    }
 
 
     /// Extract encoded input and target tensor for a batch of peptides.
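The rewritten inference loop leans on two idioms worth calling out: chunking the input while carrying a `start_idx` so each prediction can be matched back to its global position, and collecting an iterator of `Result<Vec<_>>` so the first failing batch aborts the whole pass. A minimal self-contained sketch of that control flow, with a trivial stand-in for the encode/forward step (no model or tensors involved):

use anyhow::Result;

// Stand-in for encode -> forward -> to_vec1::<f32>().
fn process_batch(batch: &[f32]) -> Result<Vec<f32>> {
    Ok(batch.iter().map(|x| x * 2.0).collect())
}

fn main() -> Result<()> {
    let data: Vec<f32> = (0..10).map(|i| i as f32).collect();
    let batch_size = 4;

    let results: Vec<(usize, f32)> = data
        .chunks(batch_size)
        .enumerate()
        .map(|(batch_idx, batch)| -> Result<Vec<(usize, f32)>> {
            let start_idx = batch_idx * batch_size;
            let preds = process_batch(batch)?;
            // Tag each prediction with its global index, as the patch does with peptides.
            Ok(preds
                .into_iter()
                .enumerate()
                .map(move |(i, p)| (start_idx + i, p))
                .collect())
        })
        .collect::<Result<Vec<Vec<(usize, f32)>>>>()? // first Err short-circuits here
        .into_iter()
        .flatten()
        .collect();

    assert_eq!(results[5], (5, 10.0));
    Ok(())
}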
diff --git a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
index 3743261..1931208 100644
--- a/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
+++ b/crates/redeem-properties/src/models/rt_cnn_transformer_model.rs
@@ -47,6 +47,7 @@ impl ModelInterface for RTCNNTFModel {
         "rt_cnn_tf"
     }
 
+    /// Create a new, untrained RTCNNTFModel with freshly initialized weights.
     fn new_untrained(device: Device) -> Result<Self> {
         let mut varmap = VarMap::new();
         let varbuilder = VarBuilder::from_varmap(&varmap, DType::F32, &device);
@@ -82,7 +83,7 @@ impl ModelInterface for RTCNNTFModel {
         })
     }
 
-    /// Create a new RTCNNTFModel from the given model and constants files.
+    /// Create a new RTCNNTFModel from the given pretrained model and constants files.
     fn new<P: AsRef<Path>>(
         model_path: P,
         constants_path: Option<P>,
@@ -93,6 +94,7 @@ impl ModelInterface for RTCNNTFModel {
         device: Device,
     ) -> Result<Self> {
         let tensor_data = load_tensors_from_model(model_path.as_ref(), &device)?;
+
         let mut varmap = candle_nn::VarMap::new();
         create_var_map(&mut varmap, tensor_data, &device)?;
         let var_store = candle_nn::VarBuilder::from_varmap(&varmap, DType::F32, &device);
@@ -112,7 +114,7 @@ impl ModelInterface for RTCNNTFModel {
             256, // ff_dim
             4,   // num_heads
             2,   // num_layers
-            100, // max_len (set appropriately for your sequence length)
+            100, // max_len (maximum supported sequence length)
             0.1, // dropout_prob
             vec!["rt_encoder.mod_nn.nn.weight"],
             vec![
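Both constructors touched above build their `VarBuilder` from a `VarMap`: `new_untrained` starts from an empty map, so every variable is freshly initialized and tracked for optimization, while `new` first fills the map with tensors loaded from disk via `create_var_map`. A minimal sketch of the untrained path, with the layer name invented for illustration:

use candle_core::{DType, Device, Result, Tensor};
use candle_nn::{Module, VarBuilder, VarMap};

fn main() -> Result<()> {
    let device = Device::Cpu;

    // Fresh VarMap: variables requested through the builder are created
    // (and tracked for an optimizer) rather than loaded from a checkpoint.
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, DType::F32, &device);

    // "proj" is an illustrative name, not one from the real model.
    let layer = candle_nn::linear(16, 4, vb.pp("proj"))?;

    let x = Tensor::zeros((2, 16), DType::F32, &device)?;
    let y = layer.forward(&x)?;
    assert_eq!(y.dims2()?, (2, 4));

    // The map now holds proj.weight and proj.bias, ready to hand to an optimizer.
    assert_eq!(varmap.all_vars().len(), 2);
    Ok(())
}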