diff --git a/R/utils_normalize.R b/R/utils_normalize.R index 70820e2..fe2c0f1 100644 --- a/R/utils_normalize.R +++ b/R/utils_normalize.R @@ -192,32 +192,31 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s .normalizeGlobalStandards = function(input, peptides_dict, standards) { PeptideSequence = PEPTIDE = PROTEIN = median_by_fraction = NULL Standard = FRACTION = LABEL = ABUNDANCE = RUN = GROUP = NULL - - proteins = as.character(unique(input$PROTEIN)) - means_by_standard = unique(input[, list(RUN)]) - for (standard_id in seq_along(standards)) { - peptide_name = unlist(peptides_dict[PeptideSequence == standards[standard_id], - as.character(PEPTIDE)], FALSE, FALSE) - if (length(peptide_name) > 0) { - standard = input[PEPTIDE == peptide_name, ] - } else { - if (standards[standard_id] %in% proteins) { - standard = input[PROTEIN == standards[standard_id], ] - } else { - msg = paste("global standard peptides or proteins, ", - standards[standard_id],", is not in dataset.", - "Please check whether 'nameStandards' input is correct or not.") - getOption("MSstatsLog")("ERROR", msg) - stop(msg) - } - } - mean_by_run = standard[GROUP != "0" & !is.na(ABUNDANCE), - list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)), - by = "RUN"] - colnames(mean_by_run)[2] = paste0("meanStandard", standard_id) - means_by_standard = merge(means_by_standard, mean_by_run, - by = "RUN", all.x = TRUE) + + input_with_peptides <- merge(input, peptides_dict, by = "PEPTIDE", all.x = TRUE) + standards_data <- input_with_peptides[ + (PeptideSequence %in% standards | PROTEIN %in% standards) & + GROUP != "0" & + !is.na(ABUNDANCE) + ] + missing_standards <- standards[!standards %in% c(standards_data$PeptideSequence, standards_data$PROTEIN)] + if (length(missing_standards) > 0) { + msg <- paste("Global standard peptides or proteins,", + paste(missing_standards, collapse = ", "), + "are not in dataset. Please check whether 'nameStandards' input is correct.") + getOption("MSstatsLog")("ERROR", msg) + stop(msg) } + standards_data[, standard := ifelse(!is.na(PeptideSequence) & PeptideSequence %in% standards, + PeptideSequence, + PROTEIN)] + means_by_standard <- standards_data[, + list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)), + by = .(RUN, standard) + ] + means_by_standard <- dcast(means_by_standard, + RUN ~ standard, + value.var = "mean_abundance") means_by_standard = data.table::melt(means_by_standard, id.vars = "RUN", variable.name = "Standard", value.name = "ABUNDANCE") means_by_standard[, mean_by_run := mean(ABUNDANCE, na.rm = TRUE), by = "RUN"] diff --git a/inst/tinytest/test_utils_normalize.R b/inst/tinytest/test_utils_normalize.R new file mode 100644 index 0000000..9a44c39 --- /dev/null +++ b/inst/tinytest/test_utils_normalize.R @@ -0,0 +1,111 @@ +# Test for .normalizeGlobalStandards function ---------------------------------- + +# Setup test data --------------------------------------------------------------- + +create_peptide_dictionary <- function() { + data.table::data.table( + PeptideSequence = c("AAAAAAAAAAAAAAGAGAGAK", "AAAAAAAAAAAAAAGAGAGAK", "AAAAAAAAAAAAVSRD"), + PrecursorCharge = c(3, 2, 3), + PEPTIDE = c("AAAAAAAAAAAAAAGAGAGAK_3", "AAAAAAAAAAAAAAGAGAGAK_2", "AAAAAAAAAAAAVSRD_3") + ) +} + +create_test_input <- function(standard_intensities, peptide2_intensities, peptide3_intensities) { + # Constants + n_proteins <- 3 + n_runs <- 48 + n_subjects <- 4 + n_fractions <- 6 + + # Create base structure + input <- data.table::data.table( + PROTEIN = rep(c("P1", "P1", "P3"), each = n_runs), + PEPTIDE = rep(c("AAAAAAAAAAAAAAGAGAGAK_3", "AAAAAAAAAAAAAAGAGAGAK_2", "AAAAAAAAAAAAVSRD_3"), + each = n_runs), + TRANSITION = rep("NA_NA", n_proteins * n_runs), + FEATURE = rep(c("AAAAAAAAAAAAAAGAGAGAK_3_NA_NA", + "AAAAAAAAAAAAAAGAGAGAK_2_NA_NA", + "AAAAAAAAAAAAVSRD_3_NA_NA"), + each = n_runs), + LABEL = rep("L", n_proteins * n_runs), + GROUP_ORIGINAL = rep(rep(c("Control", "Treatment"), each = n_runs/2), n_proteins), + SUBJECT_ORIGINAL = rep(paste0("Subject", rep(1:n_subjects, each = n_fractions)), + n_proteins * 2), + RUN = rep(1:n_runs, n_proteins), + GROUP = rep(rep(c("Control", "Treatment"), each = n_runs/2), n_proteins), + SUBJECT = rep(paste0("Subject", rep(1:n_subjects, each = n_fractions)), + n_proteins * 2), + FRACTION = rep(rep(1:n_fractions, n_subjects), n_proteins * 2), + INTENSITY = c(standard_intensities, peptide2_intensities, peptide3_intensities), + ANOMALYSCORES = rep(NA, n_proteins * n_runs), + originalRUN = rep(paste0("Run", 1:n_runs), n_proteins) + ) + + input[, ABUNDANCE := log2(INTENSITY)] + return(input) +} + +# Test 1: Standards with different intensities between groups ------------------- +test_different_group_intensities <- function() { + peptide_dict <- create_peptide_dictionary() + standards <- c("AAAAAAAAAAAAAAGAGAGAK") + + # Control group: 262144, Treatment group: 524288 + standard_intensities <- c( + rep(262144, 24), # Control (runs 1-24) + rep(524288, 24) # Treatment (runs 25-48) + ) + + # Non-standard peptides: all 262144 + peptide3_intensities <- rep(262144, 48) + + input <- create_test_input(standard_intensities, standard_intensities, peptide3_intensities) + output <- MSstats:::.normalizeGlobalStandards(input, peptide_dict, standards) + + # Verify normalization: Control runs should be shifted up, Treatment runs shifted down + control_runs <- 1:24 + treatment_runs <- 25:48 + + # Check Control group (shifted up to match treatment standard) + control_abundance <- output[RUN %in% control_runs & + !is.na(ABUNDANCE) & + !grepl(standards, PEPTIDE)]$ABUNDANCE + expect_true(all(abs(control_abundance - 18.5) < 1e-10), + info = "Control group non-standard peptides should be normalized to 18.5") + + # Check Treatment group (shifted down to match control standard) + treatment_abundance <- output[RUN %in% treatment_runs & + !is.na(ABUNDANCE) & + !grepl(standards, PEPTIDE)]$ABUNDANCE + expect_true(all(abs(treatment_abundance - 17.5) < 1e-10), + info = "Treatment group non-standard peptides should be normalized to 17.5") +} + +# Test 2: Standards with alternating intensities within fractions --------------- +test_alternating_intensities_within_fractions <- function() { + peptide_dict <- create_peptide_dictionary() + standards <- c("AAAAAAAAAAAAAAGAGAGAK") + + # Standard alternates between 262144 and 524288 within each fraction + standard_intensities <- rep(c(262144, 524288), 24) + + # Non-standard peptides: all 262144 + peptide3_intensities <- rep(262144, 48) + + input <- create_test_input(standard_intensities, standard_intensities, peptide3_intensities) + output <- MSstats:::.normalizeGlobalStandards(input, peptide_dict, standards) + + # When standards vary within fractions but average to same level, + # no net normalization should occur + all_runs <- 1:48 + normalized_abundance <- output[RUN %in% all_runs & + !is.na(ABUNDANCE) & + !grepl(standards, PEPTIDE)]$ABUNDANCE + + expect_true(all(abs(normalized_abundance - 18) < 1e-10), + info = "No normalization should occur when standard averages are equal across fractions") +} + +# Run tests --------------------------------------------------------------------- +test_different_group_intensities() +test_alternating_intensities_within_fractions() \ No newline at end of file