diff --git a/DESCRIPTION b/DESCRIPTION
index 1edd5f57..0c1d8d60 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -53,7 +53,7 @@ biocViews: ImmunoOncology, MassSpectrometry, Proteomics, Software, Normalization
LazyData: true
URL: http://msstats.org
BugReports: https://groups.google.com/forum/#!forum/msstats
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
Encoding: UTF-8
NeedsCompilation: no
Packaged: 2017-10-20 02:13:12 UTC; meenachoi
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 507c5628..794c452d 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -2,18 +2,17 @@
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
get_estimable_fixed_random <- function(parameters, contrast) {
- .Call(`_MSstats_get_estimable_fixed_random`, parameters, contrast)
+ .Call(`_MSstats_get_estimable_fixed_random`, parameters, contrast)
}
make_contrast_run_quant <- function(input, coefs, contrast_matrix, counts, is_labeled, is_reference = FALSE) {
- .Call(`_MSstats_make_contrast_run_quant`, input, coefs, contrast_matrix, counts, is_labeled, is_reference)
+ .Call(`_MSstats_make_contrast_run_quant`, input, coefs, contrast_matrix, counts, is_labeled, is_reference)
}
get_linear_summary <- function(input, coefs, counts, is_labeled) {
- .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled)
+ .Call(`_MSstats_get_linear_summary`, input, coefs, counts, is_labeled)
}
median_polish_summary <- function(x, eps = 0.01, maxiter = 10L) {
- .Call(`_MSstats_median_polish_summary`, x, eps, maxiter)
+ .Call(`_MSstats_median_polish_summary`, x, eps, maxiter)
}
-
diff --git a/R/SDRFconverter.R b/R/SDRFconverter.R
index d66967ed..dd1dc428 100644
--- a/R/SDRFconverter.R
+++ b/R/SDRFconverter.R
@@ -1,118 +1,123 @@
#' Convert SDRF experimental design file into an MSstats annotation file
-#'
+#'
#' Takes an SDRF file and outputs an MSstats annotation file. Note
-#' the information in the SDRF file must be correctly annotated for MSstats so
-#' that MSstats can identify the experimental design. In particular the
-#' biological replicates must be correctly annotated, with group comparison
-#' experiments having a unique ID for each BioReplicate. For more information
-#' on this please see the Supplementary of the most recent
+#' that the information in the SDRF file must be correctly annotated so
+#' that MSstats can identify the experimental design. In particular, the
+#' biological replicates must be correctly annotated, with group comparison
+#' experiments having a unique ID for each BioReplicate. For more information,
+#' please see the supplementary material of the most recent
#' \href{https://pubs.acs.org/doi/10.1021/acs.jproteome.2c00834}{MSstats paper}
-#'
+#'
#' @param data SDRF annotation file
-#' @param run_name Column name in SDRF file which contains the name of the MS
+#' @param run_name Column name in SDRF file which contains the name of the MS
#' run. The information in this column must match exactly with the run names in
#' the PSM file
-#' @param condition_name Column name in SDRF file which contains information on
+#' @param condition_name Column name in SDRF file which contains information on
#' the conditions in the data.
-#' @param biological_replicate Column name in SDRF file which contains the
-#' identifier for the biological replicte. Note MSstats uses this column to
-#' determine if the experiment is a repeated measure design. BioReplicte IDs
+#' @param biological_replicate Column name in SDRF file which contains the
+#' identifier for the biological replicate. Note MSstats uses this column to
+#' determine if the experiment is a repeated measure design. BioReplicate IDs
#' should only be reused if the replicate was measured multiple times.
-#' @param fraction Column name in SDFT file which contains information on the
-#' fractionation in the data. Only required if data contains fractions. Default
+#' @param fraction Column name in SDRF file which contains information on the
+#' fractionation in the data. Only required if data contains fractions. Default
#' is `NULL`
-#'
+#'
#' @importFrom data.table setDT
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' head(example_SDRF)
-#'
-#' msstats_annotation = SDRFtoAnnotation(example_SDRF)
-#'
+#'
+#' msstats_annotation <- SDRFtoAnnotation(example_SDRF)
+#'
#' head(msstats_annotation)
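+#'
+#' # A sketch only: if the SDRF describes a fractionated experiment, pass the
+#' # fraction column name as well (the column name below is hypothetical and
+#' # must match a column in your SDRF file):
+#' # msstats_annotation <- SDRFtoAnnotation(example_SDRF,
+#' #     fraction = "comment[fraction identifier]")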
-SDRFtoAnnotation = function(
- data,
- run_name = "comment[data file]",
- condition_name = "characteristics[disease]",
- biological_replicate = "characteristics[biological replicate]",
- fraction = NULL){
-
- data = data.table::setDT(data)
-
- extract_cols = c(run_name, condition_name, biological_replicate)
- if (!is.null(fraction)){
- extract_cols = c(extract_cols, fraction)
- }
- colnames(data) = MSstatsConvert:::.standardizeColnames(colnames(data))
- extract_cols = MSstatsConvert:::.standardizeColnames(extract_cols)
-
- data = data[, ..extract_cols]
- if (length(colnames(data)) < length(extract_cols)){
- stop("ERROR: One or more of the column passed in the parameters were not found in the data. Please ensure that the column names are correct.")
- }
- data.table::setnames(data, extract_cols,
- c("Run", "Condition", "BioReplicate"))
-
- return(data)
+SDRFtoAnnotation <- function(data,
+ run_name = "comment[data file]",
+ condition_name = "characteristics[disease]",
+ biological_replicate = "characteristics[biological replicate]",
+ fraction = NULL) {
+ data <- data.table::setDT(data)
+
+ extract_cols <- c(run_name, condition_name, biological_replicate)
+ if (!is.null(fraction)) {
+ extract_cols <- c(extract_cols, fraction)
+ }
+ colnames(data) <- MSstatsConvert:::.standardizeColnames(colnames(data))
+ extract_cols <- MSstatsConvert:::.standardizeColnames(extract_cols)
+
+ data <- data[, ..extract_cols]
+ if (length(colnames(data)) < length(extract_cols)) {
+        stop("ERROR: One or more of the columns passed as parameters were not found in the data. Please ensure that the column names are correct.")
+ }
+    new_names <- c("Run", "Condition", "BioReplicate")
+    if (!is.null(fraction)) {
+        new_names <- c(new_names, "Fraction")
+    }
+    data.table::setnames(data, extract_cols, new_names)
+
+ return(data)
}
#' Extract experimental design from MSstats format into SDRF format
-#'
-#' @param data MSstats formatted data that is the output of a dedicated
+#'
+#' @param data MSstats formatted data that is the output of a dedicated
#' converter, such as `MaxQtoMSstatsFormat`, `SkylinetoMSstatsFormat`, ect.
#' @param run_name Run column name in SDRF data
#' @param condition_name Condition column name in SDRF data
#' @param biological_replicate Biological replicate column name in SDRF data
#' @param fraction Fraction column name in SDRF data (if applicable). Default is
#' `NULL`. If there are no fractions keep `NULL`.
-#' @param meta_data A data.frame including any additional meta data for the SDRF
-#' file that is not included in MSstats. This meta data will be added into the
-#' final SDRF file. Please ensure the run names in the meta data matches the
+#' @param meta_data A data.frame including any additional meta data for the SDRF
+#' file that is not included in MSstats. This meta data will be added into the
+#' final SDRF file. Please ensure the run names in the meta data match the
#' run names in the MSstats data.
-#'
+#'
#' @importFrom data.table as.data.table
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' mq_ev = data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_ev.csv",
-#' package = "MSstatsConvert"))
-#' mq_pg = data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_pg.csv",
-#' package = "MSstatsConvert"))
-#' annot = data.table::fread(system.file("tinytest/raw_data/MaxQuant/annotation.csv",
-#' package = "MSstatsConvert"))
-#' maxq_imported = MaxQtoMSstatsFormat(mq_ev, annot, mq_pg, use_log_file = FALSE)
+#' mq_ev <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_ev.csv",
+#' package = "MSstatsConvert"
+#' ))
+#' mq_pg <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_pg.csv",
+#' package = "MSstatsConvert"
+#' ))
+#' annot <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/annotation.csv",
+#' package = "MSstatsConvert"
+#' ))
+#' maxq_imported <- MaxQtoMSstatsFormat(mq_ev, annot, mq_pg, use_log_file = FALSE)
#' head(maxq_imported)
-#'
-#' SDRF_file = extractSDRF(maxq_imported)
-extractSDRF = function(
- data,
- run_name = "comment[data file]",
- condition_name = "characteristics[disease]",
- biological_replicate = "characteristics[biological replicate]",
- fraction = NULL,
- meta_data = NULL){
-
- extract_cols = c("Condition", "BioReplicate", "Run", "Fraction")
- data = as.data.table(data)
- data = data[, ..extract_cols]
- data = unique(data)
-
- if (is.null(fraction)){
- data$Fraction = NULL
- data.table::setnames(data, c("Condition", "BioReplicate", "Run"),
- c(run_name, condition_name, biological_replicate))
- } else {
- data.table::setnames(data, extract_cols,
- c(run_name, condition_name, biological_replicate, fraction))
- }
-
- if (!is.null(meta_data)){
- meta_data = data.table::setDT(meta_data)
- data = merge(data, meta_data, all.x = TRUE, all.y = TRUE, by = run_name)
- }
-
- return(data)
-}
\ No newline at end of file
+#'
+#' SDRF_file <- extractSDRF(maxq_imported)
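+#'
+#' # A sketch only: additional SDRF metadata can be merged in by run name;
+#' # `extra_meta` is a hypothetical data.frame keyed by the run name column:
+#' # SDRF_file <- extractSDRF(maxq_imported, meta_data = extra_meta)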
+extractSDRF <- function(data,
+ run_name = "comment[data file]",
+ condition_name = "characteristics[disease]",
+ biological_replicate = "characteristics[biological replicate]",
+ fraction = NULL,
+ meta_data = NULL) {
+ extract_cols <- c("Condition", "BioReplicate", "Run", "Fraction")
+ data <- as.data.table(data)
+ data <- data[, ..extract_cols]
+ data <- unique(data)
+
+ if (is.null(fraction)) {
+ data$Fraction <- NULL
+ data.table::setnames(
+ data, c("Condition", "BioReplicate", "Run"),
+ c(run_name, condition_name, biological_replicate)
+ )
+ } else {
+ data.table::setnames(
+ data, extract_cols,
+ c(run_name, condition_name, biological_replicate, fraction)
+ )
+ }
+
+ if (!is.null(meta_data)) {
+ meta_data <- data.table::setDT(meta_data)
+ data <- merge(data, meta_data, all.x = TRUE, all.y = TRUE, by = run_name)
+ }
+
+ return(data)
+}
diff --git a/R/converters.R b/R/converters.R
index 4c57ee34..d392432c 100644
--- a/R/converters.R
+++ b/R/converters.R
@@ -1,4 +1,4 @@
-# For backwards compatibility, MSstats will export some converters in the
+# For backwards compatibility, MSstats will export some converters in the
# MSstats namespace
#' @export
@@ -39,4 +39,4 @@ MSstatsConvert::SkylinetoMSstatsFormat
#' @export
#' @importFrom MSstatsConvert SpectronauttoMSstatsFormat
-MSstatsConvert::SpectronauttoMSstatsFormat
\ No newline at end of file
+MSstatsConvert::SpectronauttoMSstatsFormat
diff --git a/R/dataProcess.R b/R/dataProcess.R
index 4bd00a4d..8b0927bb 100755
--- a/R/dataProcess.R
+++ b/R/dataProcess.R
@@ -1,71 +1,72 @@
#' Process MS data: clean, normalize and summarize before differential analysis
-#'
+#'
#' @param raw name of the raw (input) data set.
#' @param logTrans base of logarithm transformation: 2 (default) or 10.
-#' @param normalization normalization to remove systematic bias between MS runs.
+#' @param normalization normalization to remove systematic bias between MS runs.
#' There are three different normalizations supported:
-#' 'equalizeMedians' (default) represents constant normalization (equalizing the medians)
-#' based on reference signals is performed.
-#' 'quantile' represents quantile normalization based on reference signals
-#' 'globalStandards' represents normalization with global standards proteins.
-#' If FALSE, no normalization is performed. See MSstats vignettes for
+#' 'equalizeMedians' (default) performs constant normalization (equalizing the medians)
+#' based on reference signals.
+#' 'quantile' performs quantile normalization based on reference signals.
+#' 'globalStandards' performs normalization with global standard proteins.
+#' If FALSE, no normalization is performed. See MSstats vignettes for
#' recommendations on which normalization option to use.
-#' @param nameStandards optional vector of global standard peptide names.
+#' @param nameStandards optional vector of global standard peptide names.
#' Required only for normalization with global standard peptides.
-#' @param featureSubset "all" (default) uses all features that the data set has.
-#' "top3" uses top 3 features which have highest average of log-intensity across runs.
-#' "topN" uses top N features which has highest average of log-intensity across runs.
-#' It needs the input for n_top_feature option.
-#' "highQuality" flags uninformative feature and outliers. See MSstats vignettes for
+#' @param featureSubset "all" (default) uses all features that the data set has.
+#' "top3" uses top 3 features which have highest average of log-intensity across runs.
+#' "topN" uses top N features which has highest average of log-intensity across runs.
+#' It needs the input for n_top_feature option.
+#' "highQuality" flags uninformative feature and outliers. See MSstats vignettes for
#' recommendations on which feature selection option to use.
-#' @param remove_uninformative_feature_outlier optional. Only required if
-#' featureSubset = "highQuality". TRUE allows to remove
+#' @param remove_uninformative_feature_outlier optional. Only required if
+#' featureSubset = "highQuality". TRUE allows to remove
#' 1) noisy features (flagged in the column feature_quality with "Uninformative"),
-#' 2) outliers (flagged in the column, is_outlier with TRUE,
-#' before run-level summarization. FALSE (default) uses all features and intensities
+#' 2) outliers (flagged in the column is_outlier with TRUE)
+#' before run-level summarization. FALSE (default) uses all features and intensities
#' for run-level summarization.
#' @param min_feature_count optional. Only required if featureSubset = "highQuality".
#' Defines a minimum number of informative features a protein needs to be considered
#' in the feature selection algorithm.
-#' @param n_top_feature optional. Only required if featureSubset = 'topN'.
+#' @param n_top_feature optional. Only required if featureSubset = 'topN'.
+#' In that case, it specifies the number of top features that will be used.
#' Default is 3, which means to use top 3 features.
-#' @param summaryMethod "TMP" (default) means Tukey's median polish,
+#' @param summaryMethod "TMP" (default) means Tukey's median polish,
#' which is robust estimation method. "linear" uses linear mixed model.
-#' @param equalFeatureVar only for summaryMethod = "linear". default is TRUE.
-#' Logical variable for whether the model should account for heterogeneous variation
-#' among intensities from different features. Default is TRUE, which assume equal
-#' variance among intensities from features. FALSE means that we cannot assume equal
-#' variance among intensities from features, then we will account for heterogeneous
+#' @param equalFeatureVar only for summaryMethod = "linear". Logical variable
+#' for whether the model should account for heterogeneous variation among
+#' intensities from different features. Default is TRUE, which assumes equal
+#' variance among intensities from features. FALSE means that equal variance
+#' cannot be assumed, so the model will account for heterogeneous
#' variation from different features.
-#' @param censoredInt Missing values are censored or at random.
-#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-#' '0' uses zero intensities as censored intensity.
-#' In this case, NA intensities are missing at random.
-#' The output from Skyline should use '0'.
+#' @param censoredInt Indicates whether missing values are censored or missing at random.
+#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+#' '0' uses zero intensities as censored intensity.
+#' In this case, NA intensities are missing at random.
+#' The output from Skyline should use '0'.
#' Null assumes that all NA intensites are randomly missing.
-#' @param MBimpute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-#' TRUE (default) imputes missing values with 'NA' or '0' (depending on censoredInt option)
-#' by Accelerated failure model. If set to FALSE, no missing values are imputed.
+#' @param MBimpute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+#' TRUE (default) imputes censored values ('NA' or '0', depending on the censoredInt option)
+#' using an accelerated failure time model. If set to FALSE, no missing values are imputed.
#' FALSE is appropriate only when missingness is assumed to be at random.
#' See MSstats vignettes for recommendations on which imputation option to use.
-#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the proteins
+#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the proteins
#' where every run has at least 50\% missing values for each peptide. FALSE is default.
#' @param maxQuantileforCensored Maximum quantile for deciding censored missing values, default is 0.999
#' @param fix_missing Optional, same as the `fix_missing` parameter in MSstatsConvert::MSstatsBalancedDesign function
-#' @param numberOfCores Number of cores for parallel processing. When > 1,
-#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
+#' @param numberOfCores Number of cores for parallel processing. When > 1,
+#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS. Default is 1.
#' @inheritParams .documentFunction
-#'
+#'
#' @importFrom utils sessionInfo
#' @importFrom data.table as.data.table
-#'
+#'
#' @return A list containing:
#' \describe{
#' \item{FeatureLevelData}{A data frame with feature-level information after processing. Columns include:
#' \describe{
#' \item{PROTEIN}{Identifier for the protein associated with the feature.}
#' \item{PEPTIDE}{Identifier for the peptide sequence.}
#' \item{TRANSITION}{Identifier for the transition, typically representing a specific ion pair.}
#' \item{FEATURE}{Unique identifier for the feature, which could be a combination of peptide and transition.}
@@ -98,336 +99,365 @@
#' }
#' }
#' }
-#'
+#'
#' @export
-#'
-#' @examples
+#'
+#' @examples
#' # Consider a raw data (i.e. SRMRawData) for a label-based SRM experiment from a yeast study
#' # with ten time points (T1-T10) of interests and three biological replicates.
#' # It is a time course experiment. The goal is to detect protein abundance changes
#' # across time points.
#' head(SRMRawData)
#' # Log2 transformation and normalization are applied (default)
-#' QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' # Log10 transformation and normalization are applied
-#' QuantData1<-dataProcess(SRMRawData, logTrans=10, use_log_file = FALSE)
+#' QuantData1 <- dataProcess(SRMRawData, logTrans = 10, use_log_file = FALSE)
#' head(QuantData1$FeatureLevelData)
#' # Log2 transformation and no normalization are applied
-#' QuantData2<-dataProcess(SRMRawData,normalization=FALSE, use_log_file = FALSE)
+#' QuantData2 <- dataProcess(SRMRawData, normalization = FALSE, use_log_file = FALSE)
#' head(QuantData2$FeatureLevelData)
-#'
-dataProcess = function(
+#'
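+#' # A sketch only (not run): run-level summarization on multiple cores; progress
+#' # is written to MSstats_dataProcess_log_progress.log (see the numberOfCores parameter)
+#' # QuantData3 <- dataProcess(SRMRawData, numberOfCores = 2, use_log_file = FALSE)
+#'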
+dataProcess <- function(
raw, logTrans = 2, normalization = "equalizeMedians", nameStandards = NULL,
- featureSubset = "all", remove_uninformative_feature_outlier = FALSE,
- min_feature_count = 2, n_top_feature = 3, summaryMethod = "TMP",
- equalFeatureVar = TRUE, censoredInt = "NA", MBimpute = TRUE,
- remove50missing = FALSE, fix_missing = NULL, maxQuantileforCensored = 0.999,
+ featureSubset = "all", remove_uninformative_feature_outlier = FALSE,
+ min_feature_count = 2, n_top_feature = 3, summaryMethod = "TMP",
+ equalFeatureVar = TRUE, censoredInt = "NA", MBimpute = TRUE,
+ remove50missing = FALSE, fix_missing = NULL, maxQuantileforCensored = 0.999,
use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL,
- numberOfCores = 1
-) {
- MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
- log_file_path,
- base = "MSstats_dataProcess_log_")
- getOption("MSstatsLog")("INFO", "MSstats - dataProcess function")
- .checkDataProcessParams(
- logTrans, normalization, nameStandards,
- list(method = featureSubset, n_top = n_top_feature,
- remove_uninformative = remove_uninformative_feature_outlier),
- list(method = summaryMethod, equal_var = equalFeatureVar),
- list(symbol = censoredInt, MB = MBimpute))
-
- peptides_dict = makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
- input = MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
- input = MSstatsNormalize(input, normalization, peptides_dict, nameStandards)
- input = MSstatsMergeFractions(input)
- input = MSstatsHandleMissing(input, summaryMethod, MBimpute,
- censoredInt, maxQuantileforCensored)
- input = MSstatsSelectFeatures(input, featureSubset, n_top_feature,
- min_feature_count)
- .logDatasetInformation(input)
- getOption("MSstatsLog")("INFO",
- "== Start the summarization per subplot...")
- getOption("MSstatsMsg")("INFO",
- " == Start the summarization per subplot...")
-
- processed = getProcessed(input)
- input = MSstatsPrepareForSummarization(input, summaryMethod, MBimpute, censoredInt,
- remove_uninformative_feature_outlier)
- summarized = tryCatch(MSstatsSummarizeWithMultipleCores(input, summaryMethod,
- MBimpute, censoredInt,
- remove50missing, equalFeatureVar,
- numberOfCores),
- error = function(e) {
- print(e)
- NULL
- })
- getOption("MSstatsLog")("INFO",
- "== Summarization is done.")
- getOption("MSstatsMsg")("INFO",
- " == Summarization is done.")
- output = MSstatsSummarizationOutput(input, summarized, processed,
- summaryMethod, MBimpute, censoredInt)
- output
+ numberOfCores = 1) {
+ MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
+ log_file_path,
+ base = "MSstats_dataProcess_log_"
+ )
+ getOption("MSstatsLog")("INFO", "MSstats - dataProcess function")
+ .checkDataProcessParams(
+ logTrans, normalization, nameStandards,
+ list(
+ method = featureSubset, n_top = n_top_feature,
+ remove_uninformative = remove_uninformative_feature_outlier
+ ),
+ list(method = summaryMethod, equal_var = equalFeatureVar),
+ list(symbol = censoredInt, MB = MBimpute)
+ )
+
+ peptides_dict <- makePeptidesDictionary(as.data.table(unclass(raw)), normalization)
+ input <- MSstatsPrepareForDataProcess(raw, logTrans, fix_missing)
+ input <- MSstatsNormalize(input, normalization, peptides_dict, nameStandards)
+ input <- MSstatsMergeFractions(input)
+ input <- MSstatsHandleMissing(
+ input, summaryMethod, MBimpute,
+ censoredInt, maxQuantileforCensored
+ )
+ input <- MSstatsSelectFeatures(
+ input, featureSubset, n_top_feature,
+ min_feature_count
+ )
+    .logDatasetInformation(input)
+ getOption("MSstatsLog")("INFO",
+ "== Start the summarization per subplot...")
+ getOption("MSstatsMsg")("INFO",
+ " == Start the summarization per subplot...")
+
+ processed <- getProcessed(input)
+ input <- MSstatsPrepareForSummarization(
+ input, summaryMethod, MBimpute, censoredInt,
+ remove_uninformative_feature_outlier
+ )
+ summarized <- tryCatch(
+ MSstatsSummarizeWithMultipleCores(
+ input, summaryMethod,
+ MBimpute, censoredInt,
+ remove50missing, equalFeatureVar,
+ numberOfCores
+ ),
+ error = function(e) {
+ print(e)
+ NULL
+ }
+ )
+ getOption("MSstatsLog")("INFO",
+ "== Summarization is done.")
+ getOption("MSstatsMsg")("INFO",
+ " == Summarization is done.")
+ output <- MSstatsSummarizationOutput(
+ input, summarized, processed,
+ summaryMethod, MBimpute, censoredInt
+ )
+ output
}
#' Feature-level data summarization with multiple cores
-#'
+#'
#' @param input feature-level data processed by dataProcess subfunctions
-#' @param method summarization method: "linear" or "TMP"
-#' @param equal_variance only for summaryMethod = "linear". Default is TRUE.
-#' Logical variable for whether the model should account for heterogeneous variation
+#' @param method summarization method: "linear" or "TMP"
+#' @param equal_variance only for summaryMethod = "linear". Default is TRUE.
+#' Logical variable for whether the model should account for heterogeneous variation
#' among intensities from different features. Default is TRUE, which assume equal
-#' variance among intensities from features. FALSE means that we cannot assume
+#' variance among intensities from features. FALSE means that we cannot assume
#' equal variance among intensities from features, then we will account for
#' heterogeneous variation from different features.
-#' @param censored_symbol Missing values are censored or at random.
-#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-#' '0' uses zero intensities as censored intensity.
-#' In this case, NA intensities are missing at random.
-#' The output from Skyline should use '0'.
+#' @param censored_symbol Indicates whether missing values are censored or missing at random.
+#' 'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+#' '0' uses zero intensities as censored intensity.
+#' In this case, NA intensities are missing at random.
+#' The output from Skyline should use '0'.
#' Null assumes that all NA intensites are randomly missing.
-#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the proteins
+#' @param remove50missing only for summaryMethod = "TMP". TRUE removes the proteins
#' where every run has at least 50\% missing values for each peptide. FALSE is default.
-#' @param impute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-#' TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
+#' @param impute only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+#' TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) using an accelerated failure time model.
#' FALSE uses the values assigned by cutoffCensored
-#' @param numberOfCores Number of cores for parallel processing. When > 1,
-#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
+#' @param numberOfCores Number of cores for parallel processing. When > 1,
+#' a logfile named `MSstats_dataProcess_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS. Default is 1.
-#'
-#' @importFrom parallel makeCluster parLapply stopCluster clusterExport
-#'
+#'
+#' @importFrom data.table setDTthreads
+#'
#' @return list of length one with run-level data.
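+#'
+#' @examples
+#' # A sketch only: this helper is normally called from dataProcess; it assumes
+#' # `input` has been prepared exactly as in the MSstatsSummarizeWithSingleCore
+#' # example below
+#' # summarized <- MSstatsSummarizeWithMultipleCores(input, "TMP", TRUE, "NA",
+#' #     FALSE, TRUE, numberOfCores = 2)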
-#'
-MSstatsSummarizeWithMultipleCores = function(input, method, impute, censored_symbol,
- remove50missing, equal_variance, numberOfCores = 1) {
- if (numberOfCores > 1) {
- protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
- num_proteins = length(protein_indices)
- function_environment = environment()
- cl = parallel::makeCluster(numberOfCores)
- getOption("MSstatsLog")("INFO",
- "Starting the cluster setup for summarization")
- parallel::clusterExport(cl, c("MSstatsSummarizeSingleTMP",
- "MSstatsSummarizeSingleLinear",
- "input", "impute", "censored_symbol",
- "remove50missing", "protein_indices",
- "equal_variance"),
- envir = function_environment)
- cat(paste0("Number of proteins to process: ", num_proteins),
- sep = "\n", file = "MSstats_dataProcess_log_progress.log")
- if (method == "TMP") {
- summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) {
- if (i %% 100 == 0) {
- cat("Finished processing an additional 100 proteins",
- sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE)
- }
- single_protein = input[protein_indices[[i]],]
- MSstatsSummarizeSingleTMP(
- single_protein, impute, censored_symbol, remove50missing)
- })
- } else {
- summarized_results = parallel::parLapply(cl, seq_len(num_proteins), function(i) {
- if (i %% 100 == 0) {
- cat("Finished processing an additional 100 proteins",
- sep = "\n", file = "MSstats_dataProcess_log_progress.log", append = TRUE)
- }
- single_protein = input[protein_indices[[i]],]
- MSstatsSummarizeSingleLinear(single_protein, equal_variance)
- })
- }
- parallel::stopCluster(cl)
- return(summarized_results)
- } else {
- return(MSstatsSummarizeWithSingleCore(input, method, impute, censored_symbol,
- remove50missing, equal_variance))
- }
+#'
+MSstatsSummarizeWithMultipleCores <- function(input, method, impute, censored_symbol,
+                                              remove50missing, equal_variance,
+                                              numberOfCores = 1) {
+  if (numberOfCores <= 1) {
+    ret <- MSstatsSummarizeWithSingleCore(
+      input, method, impute, censored_symbol, remove50missing, equal_variance
+    )
+    return(ret)
+  }
+  num_proteins <- length(unique(input$PROTEIN))
+
+  getOption("MSstatsLog")("INFO", "Starting multi-core summarization")
+  cat(paste0("Number of proteins to process: ", num_proteins),
+    sep = "\n", file = "MSstats_dataProcess_log_progress.log"
+  )
+  # cap the number of threads to the number of proteins (groups)
+  ncores <- max(1L, min(numberOfCores, num_proteins))
+
+  # setDTthreads only affects data.table's internal (OpenMP) operations such as
+  # ordering and grouping; the per-protein summarizer calls below are still
+  # evaluated sequentially in R
+  data.table::setDTthreads(threads = ncores)
+
+  # summarize each protein via a grouped data.table call; .SD excludes the
+  # grouping column, so PROTEIN is added back before calling the summarizer
+  summarize_one <- function(single_protein, protein) {
+    single_protein <- data.table::copy(single_protein)
+    single_protein[, PROTEIN := protein]
+    if (method == "TMP") {
+      MSstatsSummarizeSingleTMP(single_protein, impute, censored_symbol, remove50missing)
+    } else {
+      MSstatsSummarizeSingleLinear(single_protein, equal_variance)
+    }
+  }
+  summarized_results <- input[, .(out = summarize_one(.SD, .BY$PROTEIN)), by = PROTEIN]
+  # grouping row-binds the per-protein outputs, so reshape them back into one
+  # list per protein, matching the structure returned by the single-core path
+  summarized_results <- summarized_results[, .(out = list(out)), by = PROTEIN]
+  summarized_results <- lapply(seq_len(nrow(summarized_results)), function(i) {
+    prot <- summarized_results$PROTEIN[i]
+    out <- summarized_results$out[[i]]
+    names(out) <- NULL
+    if (length(out) == 0L || is.null(out[[1]])) {
+      # protein could not be summarized; keep the single-core contract
+      return(if (method == "TMP") list(NULL, NULL) else list(NULL))
+    }
+    # first element is the run-level data.table; restore the 'Protein' column
+    dt1 <- as.data.table(out[[1]])
+    dt1[, Protein := prot]
+    out[[1]] <- dt1
+    out
+  })
+
+  return(summarized_results)
+}
+
#' Feature-level data summarization with 1 core
-#'
+#'
#' @inheritParams MSstatsSummarizeWithMultipleCores
-#'
+#'
#' @importFrom data.table uniqueN
#' @importFrom utils setTxtProgressBar
-#'
+#'
#' @return list of length one with run-level data.
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input = MSstatsSelectFeatures(input, "all")
-#' processed = getProcessed(input)
-#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input <- MSstatsSelectFeatures(input, "all")
+#' processed <- getProcessed(input)
+#' input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+#' summarized <- MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
#' length(summarized) # list of summarization outputs for each protein
#' head(summarized[[1]][[1]]) # run-level summary
-#'
-MSstatsSummarizeWithSingleCore = function(input, method, impute, censored_symbol,
- remove50missing, equal_variance) {
-
-
- protein_indices = split(seq_len(nrow(input)), list(input$PROTEIN))
- num_proteins = length(protein_indices)
- summarized_results = vector("list", num_proteins)
- if (method == "TMP") {
- pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
- for (protein_id in seq_len(num_proteins)) {
- single_protein = input[protein_indices[[protein_id]],]
- summarized_results[[protein_id]] = MSstatsSummarizeSingleTMP(
- single_protein, impute, censored_symbol, remove50missing)
- setTxtProgressBar(pb, protein_id)
- }
- close(pb)
- } else {
- pb = utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
- for (protein_id in seq_len(num_proteins)) {
- single_protein = input[protein_indices[[protein_id]],]
- summarized_result = MSstatsSummarizeSingleLinear(single_protein,
- equal_variance)
- summarized_results[[protein_id]] = summarized_result
- setTxtProgressBar(pb, protein_id)
- }
- close(pb)
+#'
+MSstatsSummarizeWithSingleCore <- function(input, method, impute, censored_symbol,
+ remove50missing, equal_variance) {
+ protein_indices <- split(seq_len(nrow(input)), list(input$PROTEIN))
+ num_proteins <- length(protein_indices)
+ summarized_results <- vector("list", num_proteins)
+ if (method == "TMP") {
+ pb <- utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
+ for (protein_id in seq_len(num_proteins)) {
+ single_protein <- input[protein_indices[[protein_id]], ]
+ summarized_results[[protein_id]] <- MSstatsSummarizeSingleTMP(
+ single_protein, impute, censored_symbol, remove50missing
+ )
+ setTxtProgressBar(pb, protein_id)
+ }
+ close(pb)
+ } else {
+ pb <- utils::txtProgressBar(min = 0, max = num_proteins, style = 3)
+ for (protein_id in seq_len(num_proteins)) {
+ single_protein <- input[protein_indices[[protein_id]], ]
+ summarized_result <- MSstatsSummarizeSingleLinear(
+ single_protein,
+ equal_variance
+ )
+ summarized_results[[protein_id]] <- summarized_result
+ setTxtProgressBar(pb, protein_id)
}
- summarized_results
+ close(pb)
+ }
+ summarized_results
}
#' Linear model-based summarization for a single protein
-#'
+#'
#' @param single_protein feature-level data for a single protein
#' @param equal_variances if TRUE, observation are assumed to be homoskedastic
-#'
+#'
#' @return list with protein-level data
-#'
+#'
#' @importFrom stats xtabs
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "linear"
-#' cens = NULL
-#' impute = FALSE
+#' raw <- DDARawData
+#' method <- "linear"
+#' cens <- NULL
+#' impute <- FALSE
#' # currently, MSstats only supports MBimpute = FALSE for linear summarization
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input = MSstatsSelectFeatures(input, "all")
-#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-#' input_split = split(input, input$PROTEIN)
-#' single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]])
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input <- MSstatsSelectFeatures(input, "all")
+#' input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+#' input_split <- split(input, input$PROTEIN)
+#' single_protein_summary <- MSstatsSummarizeSingleLinear(input_split[[1]])
#' head(single_protein_summary[[1]])
-#'
-MSstatsSummarizeSingleLinear = function(single_protein, equal_variances = TRUE) {
- ABUNDANCE = RUN = FEATURE = PROTEIN = LogIntensities = NULL
-
- label = data.table::uniqueN(single_protein$LABEL) > 1
- single_protein = single_protein[!is.na(ABUNDANCE)]
- single_protein[, RUN := factor(RUN)]
- single_protein[, FEATURE := factor(FEATURE)]
-
- counts = xtabs(~ RUN + FEATURE,
- data = unique(single_protein[, list(FEATURE, RUN)]))
- counts = as.matrix(counts)
- is_single_feature = .checkSingleFeature(single_protein)
-
- fit = try(.fitLinearModel(single_protein, is_single_feature, is_labeled = label,
- equal_variances), silent = TRUE)
-
- if (inherits(fit, "try-error")) {
- msg = paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN))
- getOption("MSstatsLog")("WARN", msg)
- getOption("MSstatsMsg")("WARN", msg)
- result = NULL
- } else {
- cf = summary(fit)$coefficients[, 1]
- result = unique(single_protein[, list(Protein = PROTEIN, RUN = RUN)])
- log_intensities = get_linear_summary(single_protein, cf,
- counts, label)
- result[, LogIntensities := log_intensities]
- }
- list(result)
+#'
+MSstatsSummarizeSingleLinear <- function(single_protein, equal_variances = TRUE) {
+ ABUNDANCE <- RUN <- FEATURE <- PROTEIN <- LogIntensities <- NULL
+
+ label <- data.table::uniqueN(single_protein$LABEL) > 1
+ single_protein <- single_protein[!is.na(ABUNDANCE)]
+ single_protein[, RUN := factor(RUN)]
+ single_protein[, FEATURE := factor(FEATURE)]
+
+ counts <- xtabs(~ RUN + FEATURE,
+ data = unique(single_protein[, list(FEATURE, RUN)])
+ )
+ counts <- as.matrix(counts)
+ is_single_feature <- .checkSingleFeature(single_protein)
+
+ fit <- try(.fitLinearModel(single_protein, is_single_feature,
+ is_labeled = label,
+ equal_variances
+ ), silent = TRUE)
+
+ if (inherits(fit, "try-error")) {
+ msg <- paste("*** error : can't fit the model for ", unique(single_protein$PROTEIN))
+ getOption("MSstatsLog")("WARN", msg)
+ getOption("MSstatsMsg")("WARN", msg)
+ result <- NULL
+ } else {
+ cf <- summary(fit)$coefficients[, 1]
+ result <- unique(single_protein[, list(Protein = PROTEIN, RUN = RUN)])
+ log_intensities <- get_linear_summary(
+ single_protein, cf,
+ counts, label
+ )
+ result[, LogIntensities := log_intensities]
+ }
+ list(result)
}
#' Tukey Median Polish summarization for a single protein
-#'
+#'
#' @param single_protein feature-level data for a single protein
#' @inheritParams MSstatsSummarizeWithSingleCore
-#'
+#'
#' @return list of two data.tables: one with fitted survival model,
#' the other with protein-level data
-#'
+#'
#' @importFrom stats predict
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' # currently, MSstats only supports MBimpute = FALSE for linear summarization
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input = MSstatsSelectFeatures(input, "all")
-#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-#' input_split = split(input, input$PROTEIN)
-#' single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]],
-#' impute, cens, FALSE)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input <- MSstatsSelectFeatures(input, "all")
+#' input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+#' input_split <- split(input, input$PROTEIN)
+#' single_protein_summary <- MSstatsSummarizeSingleTMP(
+#' input_split[[1]],
+#' impute, cens, FALSE
+#' )
#' head(single_protein_summary[[1]])
-#'
-MSstatsSummarizeSingleTMP = function(single_protein, impute, censored_symbol,
- remove50missing) {
- newABUNDANCE = n_obs = n_obs_run = RUN = FEATURE = LABEL = NULL
- predicted = censored = NULL
- cols = intersect(colnames(single_protein), c("newABUNDANCE", "cen", "RUN",
- "FEATURE", "ref"))
- single_protein = single_protein[(n_obs > 1 & !is.na(n_obs)) &
- (n_obs_run > 0 & !is.na(n_obs_run))]
- if (nrow(single_protein) == 0) {
- return(list(NULL, NULL))
- }
- single_protein[, RUN := factor(RUN)]
- single_protein[, FEATURE := factor(FEATURE)]
- if (impute & any(single_protein[["censored"]])) {
- survival_fit = .fitSurvival(single_protein[LABEL == "L", cols,
- with = FALSE])
- single_protein[, predicted := predict(survival_fit,
- newdata = .SD)]
- single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)]
- single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
- predicted, newABUNDANCE)]
- survival = single_protein[, c(cols, "predicted"), with = FALSE]
- } else {
- survival = single_protein[, cols, with = FALSE]
- survival[, predicted := NA]
- }
-
- single_protein = .isSummarizable(single_protein, remove50missing)
- if (is.null(single_protein)) {
- return(list(NULL, NULL))
- } else {
- single_protein = single_protein[!is.na(newABUNDANCE), ]
- is_labeled = nlevels(single_protein$LABEL) > 1
- result = .runTukey(single_protein, is_labeled, censored_symbol,
- remove50missing)
- }
- list(result, survival)
+#'
+MSstatsSummarizeSingleTMP <- function(single_protein, impute, censored_symbol,
+ remove50missing) {
+ newABUNDANCE <- n_obs <- n_obs_run <- RUN <- FEATURE <- LABEL <- NULL
+ predicted <- censored <- NULL
+ cols <- intersect(colnames(single_protein), c(
+ "newABUNDANCE", "cen", "RUN",
+ "FEATURE", "ref"
+ ))
+ single_protein <- single_protein[(n_obs > 1 & !is.na(n_obs)) &
+ (n_obs_run > 0 & !is.na(n_obs_run))]
+ if (nrow(single_protein) == 0) {
+ return(list(NULL, NULL))
+ }
+ single_protein[, RUN := factor(RUN)]
+ single_protein[, FEATURE := factor(FEATURE)]
+ if (impute & any(single_protein[["censored"]])) {
+ survival_fit <- .fitSurvival(single_protein[LABEL == "L", cols,
+ with = FALSE
+ ])
+ single_protein[, predicted := predict(survival_fit,
+ newdata = .SD
+ )]
+ single_protein[, predicted := ifelse(censored & (LABEL == "L"), predicted, NA)]
+ single_protein[, newABUNDANCE := ifelse(censored & LABEL == "L",
+ predicted, newABUNDANCE
+ )]
+ survival <- single_protein[, c(cols, "predicted"), with = FALSE]
+ } else {
+ survival <- single_protein[, cols, with = FALSE]
+ survival[, predicted := NA]
+ }
+
+ single_protein <- .isSummarizable(single_protein, remove50missing)
+ if (is.null(single_protein)) {
+ return(list(NULL, NULL))
+ } else {
+ single_protein <- single_protein[!is.na(newABUNDANCE), ]
+ is_labeled <- nlevels(single_protein$LABEL) > 1
+ result <- .runTukey(
+ single_protein, is_labeled, censored_symbol,
+ remove50missing
+ )
+ }
+ list(result, survival)
}
diff --git a/R/dataProcessPlots.R b/R/dataProcessPlots.R
index a0ab4e13..6cd80bd1 100644
--- a/R/dataProcessPlots.R
+++ b/R/dataProcessPlots.R
@@ -1,45 +1,45 @@
#' Visualization for explanatory data analysis
-#'
-#' @description To illustrate the quantitative data after data-preprocessing and
-#' quality control of MS runs, dataProcessPlots takes the quantitative data from
-#' function (\code{\link{dataProcess}}) as input and automatically generate
-#' three types of figures in pdf files as output :
-#' (1) profile plot (specify "ProfilePlot" in option type),
-#' to identify the potential sources of variation for each protein;
-#' (2) quality control plot (specify "QCPlot" in option type),
-#' to evaluate the systematic bias between MS runs;
-#' (3) mean plot for conditions (specify "ConditionPlot" in option type),
-#' to illustrate mean and variability of each condition per protein.
-#'
+#'
+#' @description To illustrate the quantitative data after data-preprocessing and
+#' quality control of MS runs, dataProcessPlots takes the quantitative data from
+#' function (\code{\link{dataProcess}}) as input and automatically generate
+#' three types of figures in pdf files as output :
+#' (1) profile plot (specify "ProfilePlot" in option type),
+#' to identify the potential sources of variation for each protein;
+#' (2) quality control plot (specify "QCPlot" in option type),
+#' to evaluate the systematic bias between MS runs;
+#' (3) mean plot for conditions (specify "ConditionPlot" in option type),
+#' to illustrate mean and variability of each condition per protein.
+#'
#' @param data name of the (output of dataProcess function) data set.
-#' @param type choice of visualization. "ProfilePlot" represents profile plot of
-#' log intensities across MS runs. "QCPlot" represents quality control plot of
-#' log intensities across MS runs. "ConditionPlot" represents mean plot of log
+#' @param type choice of visualization. "ProfilePlot" represents profile plot of
+#' log intensities across MS runs. "QCPlot" represents quality control plot of
+#' log intensities across MS runs. "ConditionPlot" represents mean plot of log
#' ratios (Light/Heavy) across conditions.
-#' @param featureName for "ProfilePlot" only, "Transition" (default) means
-#' printing feature legend in transition-level; "Peptide" means printing feature
+#' @param featureName for "ProfilePlot" only, "Transition" (default) means
+#' printing feature legend in transition-level; "Peptide" means printing feature
#' legend in peptide-level; "NA" means no feature legend printing.
-#' @param ylimUp upper limit for y-axis in the log scale. FALSE(Default) for
-#' Profile Plot and QC Plot use the upper limit as rounded off maximum of
+#' @param ylimUp upper limit for y-axis in the log scale. FALSE(Default) for
+#' Profile Plot and QC Plot use the upper limit as rounded off maximum of
#' log2(intensities) after normalization + 3. FALSE(Default) for Condition Plot
#' is maximum of log ratio + SD or CI.
-#' @param ylimDown lower limit for y-axis in the log scale. FALSE(Default) for
+#' @param ylimDown lower limit for y-axis in the log scale. FALSE(Default) for
#' Profile Plot and QC Plot is 0. FALSE(Default) for Condition Plot is minumum
#' of log ratio - SD or CI.
-#' @param scale for "ConditionPlot" only, FALSE(default) means each conditional
-#' level is not scaled at x-axis according to its actual value (equal space at
+#' @param scale for "ConditionPlot" only, FALSE(default) means each conditional
+#' level is not scaled at x-axis according to its actual value (equal space at
#' x-axis). TRUE means each conditional level is scaled at x-axis according to
#' its actual value (unequal space at x-axis).
-#' @param interval for "ConditionPlot" only, "CI"(default) uses confidence
-#' interval with 0.95 significant level for the width of error bar.
+#' @param interval for "ConditionPlot" only, "CI"(default) uses confidence
+#' interval with 0.95 significant level for the width of error bar.
#' "SD" uses standard deviation for the width of error bar.
-#' @param x.axis.size size of x-axis labeling for "Run" in Profile Plot and
+#' @param x.axis.size size of x-axis labeling for "Run" in Profile Plot and
#' QC Plot, and "Condition" in Condition Plot. Default is 10.
#' @param y.axis.size size of y-axis labels. Default is 10.
-#' @param text.size size of labels represented each condition at the top of
+#' @param text.size size of labels represented each condition at the top of
#' graph in Profile Plot and QC plot. Default is 4.
#' @param text.angle angle of labels represented each condition at the top
-#' of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+#' of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
#' Default is 0.
#' @param legend.size size of feature legend (transition-level or peptide-level)
#' above graph in Profile Plot. Default is 7.
@@ -49,83 +49,83 @@
#' @param height height of the saved file in pixels. Default is 600 pixels.
#' @param which.Protein Protein list to draw plots. List can be names of Proteins
#' or order numbers of Proteins from levels(data$FeatureLevelData$PROTEIN).
-#' Default is "all", which generates all plots for each protein.
+#' Default is "all", which generates all plots for each protein.
#' For QC plot, "allonly" will generate one QC plot with all proteins.
#' @param originalPlot TRUE(default) draws original profile plots.
-#' @param summaryPlot TRUE(default) draws profile plots with
+#' @param summaryPlot TRUE(default) draws profile plots with
#' summarization for run levels.
-#' @param save_condition_plot_result TRUE saves the table with values
+#' @param save_condition_plot_result TRUE saves the table with values
#' using condition plots. Default is FALSE.
-#' @param remove_uninformative_feature_outlier It only works after users used
+#' @param remove_uninformative_feature_outlier It only works after users used
#' featureSubset="highQuality" in dataProcess. TRUE allows to remove
-#' 1) the features are flagged in the column, feature_quality="Uninformative"
-#' which are features with bad quality,
-#' 2) outliers that are flagged in the column, is_outlier=TRUE in Profile plots.
+#' 1) features flagged in the column feature_quality="Uninformative",
+#' which are features with bad quality, and
+#' 2) outliers flagged in the column is_outlier=TRUE in Profile plots.
#' FALSE (default) shows all features and intensities in profile plots.
-#' @param address prefix for the filename that will store the results.
-#' @param isPlotly Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
+#' @param address prefix for the filename that will store the results.
+#' @param isPlotly Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
#' will save Plotly plots as HTML files. If set to FALSE MSstats will save ggplot2 plots
#' as PDF files
-#' Default folder is the current working directory.
+#' Default folder is the current working directory.
#' The other assigned folder has to be existed under the current working directory.
-#' An output pdf file is automatically created with the default name of
-#' "ProfilePlot.pdf" or "QCplot.pdf" or "ConditionPlot.pdf" or "ConditionPlot_value.csv".
-#' The command address can help to specify where to store the file as well as
-#' how to modify the beginning of the file name.
+#' An output pdf file is automatically created with the default name of
+#' "ProfilePlot.pdf" or "QCplot.pdf" or "ConditionPlot.pdf" or "ConditionPlot_value.csv".
+#' The command address can help to specify where to store the file as well as
+#' how to modify the beginning of the file name.
#' If address=FALSE, plot will be not saved as pdf file but showed in window.
-#'
+#'
#' @details
#' \itemize{
#' \item{Profile Plot : identify the potential sources of variation of each protein. QuantData$FeatureLevelData is used for plots. X-axis is run. Y-axis is log-intensities of transitions. Reference/endogenous signals are in the left/right panel. Line colors indicate peptides and line types indicate transitions. In summarization plots, gray dots and lines are the same as original profile plots with QuantData$FeatureLevelData. Dark dots and lines are for summarized intensities from QuantData$ProteinLevelData.}
#' \item{QC Plot : illustrate the systematic bias between MS runs. After normalization, the reference signals for all proteins should be stable across MS runs. QuantData$FeatureLevelData is used for plots. X-axis is run. Y-axis is log-intensities of transition. Reference/endogenous signals are in the left/right panel. The pdf file contains (1) QC plot for all proteins and (2) QC plots for each protein separately.}
#' \item{Condition Plot : illustrate the systematic difference between conditions. Summarized intensnties from QuantData$ProteinLevelData are used for plots. X-axis is condition. Y-axis is summarized log transformed intensity. If scale is TRUE, the levels of conditions is scaled according to its actual values at x-axis. Red points indicate the mean for each condition. If interval is "CI", blue error bars indicate the confidence interval with 0.95 significant level for each condition. If interval is "SD", blue error bars indicate the standard deviation for each condition.The interval is not related with model-based analysis.}
#' }
-#' The input of this function is the quantitative data from function \code{\link{dataProcess}}.
-#'
-#'
+#' The input of this function is the quantitative data from function \code{\link{dataProcess}}.
+#'
+#'
#' @import ggplot2
#' @importFrom graphics axis image legend mtext par plot.new title plot
#' @importFrom grDevices dev.off hcl pdf
#' @importFrom plotly ggplotly style add_trace plot_ly subplot
#' @importFrom htmltools tagList div save_html
-#'
+#'
#' @export
-#'
-#' @examples
-#' # Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interests,
-#' # three biological replicates, and no technical replicates which is a time-course experiment.
-#' # The goal is to provide pre-analysis visualization by automatically generate two types of figures
-#' # in two separate pdf files.
-#' # Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
+#'
+#' @examples
+#' # Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interest,
+#' # three biological replicates, and no technical replicates, which is a time-course experiment.
+#' # The goal is to provide pre-analysis visualization by automatically generating two types of figures
+#' # in two separate pdf files.
+#' # Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
#' # whereas, Protein PMG2 (gene name GPM2) is not.
-#'
-#' QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+#'
+#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' # Profile plot
-#' dataProcessPlots(data=QuantData,type="ProfilePlot")
-#' # Quality control plot
-#' dataProcessPlots(data=QuantData,type="QCPlot")
+#' dataProcessPlots(data = QuantData, type = "ProfilePlot")
+#' # Quality control plot
+#' dataProcessPlots(data = QuantData, type = "QCPlot")
#' # Quantification plot for conditions
-#' dataProcessPlots(data=QuantData,type="ConditionPlot")
-#'
-dataProcessPlots = function(
- data, type, featureName = "Transition", ylimUp = FALSE, ylimDown = FALSE,
- scale = FALSE, interval = "CI", x.axis.size = 10, y.axis.size = 10,
- text.size = 4, text.angle = 0, legend.size = 7, dot.size.profile = 2,
- dot.size.condition = 3, width = 800, height = 600, which.Protein = "all",
- originalPlot = TRUE, summaryPlot = TRUE, save_condition_plot_result = FALSE,
- remove_uninformative_feature_outlier = FALSE, address = "", isPlotly = FALSE
-) {
- PROTEIN = Protein = NULL
-
- type = toupper(type)
- processed = data.table::as.data.table(data$FeatureLevelData)
- summarized = data.table::as.data.table(data$ProteinLevelData)
+#' dataProcessPlots(data = QuantData, type = "ConditionPlot")
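+#' # A sketch only (not run): interactive plotly versions of the same plots,
+#' # saved as HTML when a file address is given
+#' # dataProcessPlots(data = QuantData, type = "QCPlot", isPlotly = TRUE)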
+#'
+dataProcessPlots <- function(
+ data, type, featureName = "Transition", ylimUp = FALSE, ylimDown = FALSE,
+ scale = FALSE, interval = "CI", x.axis.size = 10, y.axis.size = 10,
+ text.size = 4, text.angle = 0, legend.size = 7, dot.size.profile = 2,
+ dot.size.condition = 3, width = 800, height = 600, which.Protein = "all",
+ originalPlot = TRUE, summaryPlot = TRUE, save_condition_plot_result = FALSE,
+ remove_uninformative_feature_outlier = FALSE, address = "", isPlotly = FALSE) {
+ PROTEIN <- Protein <- NULL
+
+ type <- toupper(type)
+ processed <- data.table::as.data.table(data$FeatureLevelData)
+ summarized <- data.table::as.data.table(data$ProteinLevelData)
processed[, PROTEIN := factor(PROTEIN)]
summarized[, Protein := factor(Protein)]
-
+
checkmate::assertChoice(type, c("PROFILEPLOT", "QCPLOT", "CONDITIONPLOT"),
- .var.name = "type")
+ .var.name = "type"
+ )
if (as.character(address) == "FALSE") {
if (which.Protein == "all") {
stop("** Cannnot generate all plots in a screen. Please set one protein at a time.")
@@ -133,94 +133,96 @@ dataProcessPlots = function(
stop("** Cannnot generate multiple plots in a screen. Please set one protein at a time.")
}
}
- warning("Avoid plotting all proteins as it can take a large amount of time
+ warning("Avoid plotting all proteins as it can take a large amount of time
to download the files")
- if(isPlotly & address != FALSE) {
- print("Plots will be saved as .HTML file as plotly is selected, set isPlotly = FALSE, if
+ if (isPlotly & address != FALSE) {
+ print("Plots will be saved as .HTML file as plotly is selected, set isPlotly = FALSE, if
you want to generate PDF using ggplot2")
}
-
+
if (type == "PROFILEPLOT") {
- plots <- .plotProfile(processed, summarized, featureName, ylimUp, ylimDown,
- x.axis.size, y.axis.size, text.size, text.angle, legend.size,
- dot.size.profile, width, height, which.Protein, originalPlot,
- summaryPlot, remove_uninformative_feature_outlier, address, isPlotly)
- plotly_plots = list()
- if(isPlotly) {
- og_plotly_plot = NULL
- summ_plotly_plot = NULL
- if("original_plot" %in% names(plots)) {
- for(i in seq_along(plots[["original_plot"]])) {
- plot_i <- plots[["original_plot"]][[paste("plot",i)]]
- og_plotly_plot <- .convertGgplot2Plotly(plot_i,tips=c("FEATURE","RUN","newABUNDANCE"))
- og_plotly_plot = .fixLegendPlotlyPlotsDataprocess(og_plotly_plot)
- og_plotly_plot = .fixCensoredPointsLegendProfilePlotsPlotly(og_plotly_plot)
-
- if(toupper(featureName) == "NA") {
- og_plotly_plot = .retainCensoredDataPoints(og_plotly_plot)
- }
- plotly_plots = c(plotly_plots, list(og_plotly_plot))
- }
- }
- if("summary_plot" %in% names(plots)) {
- for(i in seq_along(plots[["summary_plot"]])) {
- plot_i <- plots[["summary_plot"]][[paste("plot",i)]]
- summ_plotly_plot <- .convertGgplot2Plotly(plot_i,tips=c("FEATURE","RUN","ABUNDANCE"))
- summ_plotly_plot = .fixLegendPlotlyPlotsDataprocess(summ_plotly_plot)
- summ_plotly_plot = .fixCensoredPointsLegendProfilePlotsPlotly(summ_plotly_plot)
- if(toupper(featureName) == "NA") {
- summ_plotly_plot = .retainCensoredDataPoints(summ_plotly_plot)
- }
- plotly_plots = c(plotly_plots, list(summ_plotly_plot))
- }
- }
-
- if(address != FALSE) {
- .savePlotlyPlotHTML(plotly_plots,address,"ProfilePlot" ,width, height)
+ plots <- .plotProfile(
+ processed, summarized, featureName, ylimUp, ylimDown,
+ x.axis.size, y.axis.size, text.size, text.angle, legend.size,
+ dot.size.profile, width, height, which.Protein, originalPlot,
+ summaryPlot, remove_uninformative_feature_outlier, address, isPlotly
+ )
+ plotly_plots <- list()
+ if (isPlotly) {
+ og_plotly_plot <- NULL
+ summ_plotly_plot <- NULL
+ if ("original_plot" %in% names(plots)) {
+ for (i in seq_along(plots[["original_plot"]])) {
+ plot_i <- plots[["original_plot"]][[paste("plot", i)]]
+ og_plotly_plot <- .convertGgplot2Plotly(plot_i, tips = c("FEATURE", "RUN", "newABUNDANCE"))
+ og_plotly_plot <- .fixLegendPlotlyPlotsDataprocess(og_plotly_plot)
+ og_plotly_plot <- .fixCensoredPointsLegendProfilePlotsPlotly(og_plotly_plot)
+
+ if (toupper(featureName) == "NA") {
+ og_plotly_plot <- .retainCensoredDataPoints(og_plotly_plot)
}
- plotly_plots
+ plotly_plots <- c(plotly_plots, list(og_plotly_plot))
+ }
}
- }
-
- else if (type == "QCPLOT") {
- plots <- .plotQC(processed, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size, dot.size.profile, width, height,
- which.Protein, address, isPlotly)
- plotly_plots <- vector("list", length(plots))
- if(isPlotly) {
- for(i in seq_along(plots)) {
- plot <- plots[[i]]
- plotly_plot <- .convertGgplot2Plotly(plot)
- plotly_plot = .fixLegendPlotlyPlotsDataprocess(plotly_plot)
- plotly_plots[[i]] = list(plotly_plot)
+ if ("summary_plot" %in% names(plots)) {
+ for (i in seq_along(plots[["summary_plot"]])) {
+ plot_i <- plots[["summary_plot"]][[paste("plot", i)]]
+ summ_plotly_plot <- .convertGgplot2Plotly(plot_i, tips = c("FEATURE", "RUN", "ABUNDANCE"))
+ summ_plotly_plot <- .fixLegendPlotlyPlotsDataprocess(summ_plotly_plot)
+ summ_plotly_plot <- .fixCensoredPointsLegendProfilePlotsPlotly(summ_plotly_plot)
+ if (toupper(featureName) == "NA") {
+ summ_plotly_plot <- .retainCensoredDataPoints(summ_plotly_plot)
}
- if(address != FALSE) {
- .savePlotlyPlotHTML(plotly_plots,address,"QCPlot" ,width, height)
- }
- plotly_plots <- unlist(plotly_plots, recursive = FALSE)
- plotly_plots
+ plotly_plots <- c(plotly_plots, list(summ_plotly_plot))
+ }
}
- }
-
- else if (type == "CONDITIONPLOT") {
- plots <- .plotCondition(processed, summarized, ylimUp, ylimDown, scale, interval,
- x.axis.size, y.axis.size, text.size, text.angle, legend.size,
- dot.size.profile, dot.size.condition, width, height,
- which.Protein, save_condition_plot_result, address, isPlotly)
- plotly_plots <- vector("list", length(plots))
- if(isPlotly) {
- for(i in seq_along(plots)) {
- plot <- plots[[i]]
- plotly_plot <- .convertGgplot2Plotly(plot)
- plotly_plot = .fixLegendPlotlyPlotsDataprocess(plotly_plot)
- plotly_plots[[i]] = list(plotly_plot)
- }
- if(address != FALSE) {
- .savePlotlyPlotHTML(plotly_plots,address,"ConditionPlot" ,width, height)
- }
- plotly_plots <- unlist(plotly_plots, recursive = FALSE)
- plotly_plots
+
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(plotly_plots, address, "ProfilePlot", width, height)
+ }
+ plotly_plots
+ }
+ } else if (type == "QCPLOT") {
+ plots <- .plotQC(
+ processed, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size, dot.size.profile, width, height,
+ which.Protein, address, isPlotly
+ )
+ plotly_plots <- vector("list", length(plots))
+ if (isPlotly) {
+ for (i in seq_along(plots)) {
+ plot <- plots[[i]]
+ plotly_plot <- .convertGgplot2Plotly(plot)
+ plotly_plot <- .fixLegendPlotlyPlotsDataprocess(plotly_plot)
+ plotly_plots[[i]] <- list(plotly_plot)
+ }
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(plotly_plots, address, "QCPlot", width, height)
+ }
+ plotly_plots <- unlist(plotly_plots, recursive = FALSE)
+ plotly_plots
+ }
+ } else if (type == "CONDITIONPLOT") {
+ plots <- .plotCondition(
+ processed, summarized, ylimUp, ylimDown, scale, interval,
+ x.axis.size, y.axis.size, text.size, text.angle, legend.size,
+ dot.size.profile, dot.size.condition, width, height,
+ which.Protein, save_condition_plot_result, address, isPlotly
+ )
+ plotly_plots <- vector("list", length(plots))
+ if (isPlotly) {
+ for (i in seq_along(plots)) {
+ plot <- plots[[i]]
+ plotly_plot <- .convertGgplot2Plotly(plot)
+ plotly_plot <- .fixLegendPlotlyPlotsDataprocess(plotly_plot)
+ plotly_plots[[i]] <- list(plotly_plot)
}
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(plotly_plots, address, "ConditionPlot", width, height)
+ }
+ plotly_plots <- unlist(plotly_plots, recursive = FALSE)
+ plotly_plots
+ }
}
}
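# A minimal usage sketch of the interactive path, assuming QuantData has been
# created with dataProcess(SRMRawData) as in the examples above. With the default
# address = "" and isPlotly = TRUE the plotly versions of the plots are written to
# a zipped HTML report; with isPlotly = FALSE the ggplot2 plots are saved as PDF.
library(MSstats)
QuantData <- dataProcess(SRMRawData)
dataProcessPlots(data = QuantData, type = "QCPlot", isPlotly = TRUE)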
@@ -228,69 +230,76 @@ dataProcessPlots = function(
#' @importFrom utils setTxtProgressBar
#' @importFrom stats xtabs
#' @keywords internal
-.plotProfile = function(
- processed, summarized, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size, dot.size.profile, width, height, proteins,
- originalPlot, summaryPlot, remove_uninformative_feature_outlier, address, isPlotly
-) {
- ABUNDANCE = PROTEIN = feature_quality = is_outlier = Protein = GROUP = NULL
- SUBJECT = LABEL = RUN = xtabs = PEPTIDE = FEATURE = NULL
- LogIntensities = TRANSITION = FRACTION = censored = analysis = NULL
-
- yaxis.name = .getYaxis(processed)
- is_censored = is.element("censored", colnames(processed))
- all_proteins = as.character(unique(processed$PROTEIN))
+.plotProfile <- function(
+ processed, summarized, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size, dot.size.profile, width, height, proteins,
+ originalPlot, summaryPlot, remove_uninformative_feature_outlier, address, isPlotly) {
+ ABUNDANCE <- PROTEIN <- feature_quality <- is_outlier <- Protein <- GROUP <- NULL
+ SUBJECT <- LABEL <- RUN <- xtabs <- PEPTIDE <- FEATURE <- NULL
+ LogIntensities <- TRANSITION <- FRACTION <- censored <- analysis <- NULL
+
+ yaxis.name <- .getYaxis(processed)
+ is_censored <- is.element("censored", colnames(processed))
+ all_proteins <- as.character(unique(processed$PROTEIN))
if (remove_uninformative_feature_outlier) {
if (is.element("feature_quality", colnames(processed))) {
processed[, ABUNDANCE := ifelse(
- feature_quality == "Noninformative" | is_outlier, NA, ABUNDANCE)]
- msg = "** Filtered out uninformative feature and outliers in the profile plots."
+ feature_quality == "Noninformative" | is_outlier, NA, ABUNDANCE
+ )]
+ msg <- "** Filtered out uninformative feature and outliers in the profile plots."
} else {
- msg = "** To remove uninformative features or outliers, please use \"featureSubset == \"highQuality\" option in \"dataProcess\" function."
+ msg <- "** To remove uninformative features or outliers, please use \"featureSubset == \"highQuality\" option in \"dataProcess\" function."
}
getOption("MSstatsMsg")("INFO", msg)
}
-
- processed = processed[order(GROUP, SUBJECT, LABEL)]
- processed[, RUN := factor(RUN, levels = unique(RUN),
- labels = seq(1, length(unique(RUN))))]
+
+ processed <- processed[order(GROUP, SUBJECT, LABEL)]
+ processed[, RUN := factor(RUN,
+ levels = unique(RUN),
+ labels = seq(1, length(unique(RUN)))
+ )]
processed[, RUN := as.numeric(RUN)]
-
- summarized = summarized[order(GROUP, SUBJECT)]
- summarized[, RUN := factor(RUN, levels = unique(RUN),
- labels = seq(1, length(unique(RUN))))]
+
+ summarized <- summarized[order(GROUP, SUBJECT)]
+ summarized[, RUN := factor(RUN,
+ levels = unique(RUN),
+ labels = seq(1, length(unique(RUN)))
+ )]
summarized[, RUN := as.numeric(RUN)]
-
+
## Meena :due to GROUP=0 for labeled.. extra care required.
- tempGroupName = unique(processed[, c("GROUP", "RUN")])
+ tempGroupName <- unique(processed[, c("GROUP", "RUN")])
if (length(unique(processed$LABEL)) == 2) {
- tempGroupName = tempGroupName[GROUP != '0']
- }
- tempGroupName = tempGroupName[order(RUN), ] ## Meena : should we order by GROUP or RUN? I guess by RUn, because x-axis is by RUN
- level.group = as.character(unique(tempGroupName$GROUP))
- tempGroupName$GROUP = factor(tempGroupName$GROUP,
- levels = level.group) ## Meena : factor GROUP again, due to 1, 10, 2, ... if you have better way, please change
-
- groupAxis = as.numeric(xtabs(~GROUP, tempGroupName))
- cumGroupAxis = cumsum(groupAxis)
- lineNameAxis = cumGroupAxis[-nlevels(tempGroupName$GROUP)]
+ tempGroupName <- tempGroupName[GROUP != "0"]
+ }
+  tempGroupName <- tempGroupName[order(RUN), ] ## Meena: should we order by GROUP or RUN? I guess by RUN, because the x-axis is by RUN
+ level.group <- as.character(unique(tempGroupName$GROUP))
+ tempGroupName$GROUP <- factor(tempGroupName$GROUP,
+ levels = level.group
+  ) ## Meena: factor GROUP again to avoid alphabetical ordering like 1, 10, 2, ...; if you have a better way, please change
+
+ groupAxis <- as.numeric(xtabs(~GROUP, tempGroupName))
+ cumGroupAxis <- cumsum(groupAxis)
+ lineNameAxis <- cumGroupAxis[-nlevels(tempGroupName$GROUP)]
if (proteins != "all") {
- selected_proteins = getSelectedProteins(proteins, all_proteins)
- processed = processed[PROTEIN %in% selected_proteins]
- summarized = summarized[Protein %in% selected_proteins]
- processed[, PROTEIN := factor(PROTEIN)]
- summarized[, PROTEIN := factor(Protein)]
+ selected_proteins <- getSelectedProteins(proteins, all_proteins)
+ processed <- processed[PROTEIN %in% selected_proteins]
+ summarized <- summarized[Protein %in% selected_proteins]
+ processed[, PROTEIN := factor(PROTEIN)]
+ summarized[, PROTEIN := factor(Protein)]
}
-
- y.limup = ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(processed$ABUNDANCE, na.rm = TRUE) + 3))
- y.limdown = ifelse(is.numeric(ylimDown), ylimDown, -1)
-
- groupName = data.frame(RUN = c(0, lineNameAxis) + groupAxis / 2 + 0.5,
- ABUNDANCE = rep(y.limup - 1, length(groupAxis)),
- Name = levels(tempGroupName$GROUP))
-
-
+
+ y.limup <- ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(processed$ABUNDANCE, na.rm = TRUE) + 3))
+ y.limdown <- ifelse(is.numeric(ylimDown), ylimDown, -1)
+
+ groupName <- data.frame(
+ RUN = c(0, lineNameAxis) + groupAxis / 2 + 0.5,
+ ABUNDANCE = rep(y.limup - 1, length(groupAxis)),
+ Name = levels(tempGroupName$GROUP)
+ )
+
+
if (length(unique(processed$LABEL)) == 2) {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
} else {
@@ -300,7 +309,7 @@ dataProcessPlots = function(
processed[, LABEL := factor(LABEL, labels = c("Reference"))]
}
}
-
+
if ("feature_quality" %in% colnames(processed)) {
processed[, feature_quality := NULL]
}
@@ -308,255 +317,277 @@ dataProcessPlots = function(
processed[, is_outlier := NULL]
}
output_plots <- list()
- output_plots[["original_plot"]] = list()
- output_plots[["summary_plot"]] = list()
- all_proteins = levels(processed$PROTEIN)
+ output_plots[["original_plot"]] <- list()
+ output_plots[["summary_plot"]] <- list()
+ all_proteins <- levels(processed$PROTEIN)
if (originalPlot) {
- if(!isPlotly) {
- savePlot(address, "ProfilePlot", width, height)
+ if (!isPlotly) {
+ savePlot(address, "ProfilePlot", width, height)
}
- pb = utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
+ pb <- utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
for (i in seq_along(all_proteins)) {
- single_protein = .getSingleProteinForProfile(processed, all_proteins, i)
+ single_protein <- .getSingleProteinForProfile(processed, all_proteins, i)
if (all(is.na(single_protein$ABUNDANCE))) {
next()
}
-
- pept_feat = unique(single_protein[, list(PEPTIDE, FEATURE)])
- counts = pept_feat[, list(N = .N), by = "PEPTIDE"]$N
- s = rep(seq_along(counts), times = counts)
- ss = unlist(lapply(counts, function(x) seq(1, x)), FALSE, FALSE)
- groupNametemp = data.frame(groupName,
- "FEATURE" = unique(single_protein$FEATURE)[1],
- "PEPTIDE" = unique(single_protein$PEPTIDE)[1])
-
- dot_colors = c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
- check_length = length(unique(s)) %/% length(dot_colors)
+
+ pept_feat <- unique(single_protein[, list(PEPTIDE, FEATURE)])
+ counts <- pept_feat[, list(N = .N), by = "PEPTIDE"]$N
+ s <- rep(seq_along(counts), times = counts)
+ ss <- unlist(lapply(counts, function(x) seq(1, x)), FALSE, FALSE)
+ groupNametemp <- data.frame(groupName,
+ "FEATURE" = unique(single_protein$FEATURE)[1],
+ "PEPTIDE" = unique(single_protein$PEPTIDE)[1]
+ )
+
+ dot_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
+ check_length <- length(unique(s)) %/% length(dot_colors)
if (check_length > 0) {
- dot_colors = rep(dot_colors, times = check_length + 1)
+ dot_colors <- rep(dot_colors, times = check_length + 1)
}
-
- profile_plot = .makeProfilePlot(single_protein, is_censored, featureName,
- y.limdown, y.limup,
- x.axis.size, y.axis.size,
- text.size, text.angle,
- legend.size, dot.size.profile,
- ss, s, cumGroupAxis, yaxis.name,
- lineNameAxis, groupNametemp, dot_colors)
-
+
+ profile_plot <- .makeProfilePlot(
+ single_protein, is_censored, featureName,
+ y.limdown, y.limup,
+ x.axis.size, y.axis.size,
+ text.size, text.angle,
+ legend.size, dot.size.profile,
+ ss, s, cumGroupAxis, yaxis.name,
+ lineNameAxis, groupNametemp, dot_colors
+ )
+
setTxtProgressBar(pb, i)
print(profile_plot)
- output_plots[["original_plot"]][[paste("plot",i)]] <- profile_plot
+ output_plots[["original_plot"]][[paste("plot", i)]] <- profile_plot
}
-
+
close(pb)
-
+
if (address != FALSE & !isPlotly) {
dev.off()
- }
+ }
}
-
+
if (summaryPlot) {
- protein_by_run = expand.grid(Protein = unique(summarized$Protein),
- RUN = unique(summarized$RUN))
- summarized = merge(summarized, protein_by_run, by = c("Protein", "RUN"),
- all.x = TRUE, all.y = TRUE)
- if(!isPlotly) {
- savePlot(address, "ProfilePlot_wSummarization", width, height)
+ protein_by_run <- expand.grid(
+ Protein = unique(summarized$Protein),
+ RUN = unique(summarized$RUN)
+ )
+ summarized <- merge(summarized, protein_by_run,
+ by = c("Protein", "RUN"),
+ all.x = TRUE, all.y = TRUE
+ )
+ if (!isPlotly) {
+ savePlot(address, "ProfilePlot_wSummarization", width, height)
}
- pb = utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
+ pb <- utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
for (i in seq_along(all_proteins)) {
- single_protein = .getSingleProteinForProfile(processed, all_proteins, i)
+ single_protein <- .getSingleProteinForProfile(processed, all_proteins, i)
if (all(is.na(single_protein$ABUNDANCE))) {
next()
}
-
- pept_feat = unique(single_protein[, list(PEPTIDE, FEATURE)])
- counts = pept_feat[, list(N = .N), by = "PEPTIDE"]$N
- s = rep(seq_along(counts), times = counts)
- ss = unlist(lapply(counts, function(x) seq(1, x)), FALSE, FALSE)
- groupNametemp = data.frame(groupName,
- FEATURE = unique(single_protein$FEATURE)[1],
- analysis = "Run summary")
-
- single_protein_summ = summarized[Protein == all_proteins[i], ]
- quant = single_protein_summ[
+
+ pept_feat <- unique(single_protein[, list(PEPTIDE, FEATURE)])
+ counts <- pept_feat[, list(N = .N), by = "PEPTIDE"]$N
+ s <- rep(seq_along(counts), times = counts)
+ ss <- unlist(lapply(counts, function(x) seq(1, x)), FALSE, FALSE)
+ groupNametemp <- data.frame(groupName,
+ FEATURE = unique(single_protein$FEATURE)[1],
+ analysis = "Run summary"
+ )
+
+ single_protein_summ <- summarized[Protein == all_proteins[i], ]
+ quant <- single_protein_summ[
Protein == all_proteins[i],
- list(PROTEIN = unique(Protein), PEPTIDE = "Run summary",
- TRANSITION = "Run summary", FEATURE = "Run summary",
- LABEL = "Endogenous", RUN = RUN,
- ABUNDANCE = LogIntensities, FRACTION = 1)
- ]
+ list(
+ PROTEIN = unique(Protein), PEPTIDE = "Run summary",
+ TRANSITION = "Run summary", FEATURE = "Run summary",
+ LABEL = "Endogenous", RUN = RUN,
+ ABUNDANCE = LogIntensities, FRACTION = 1
+ )
+ ]
if (is_censored) {
- quant$censored = FALSE
+ quant$censored <- FALSE
}
- quant$analysis = "Run summary"
- quant$newABUNDANCE = quant$ABUNDANCE
- single_protein$analysis = "Processed feature-level data"
- combined = rbind(single_protein[
- ,
- list(PROTEIN, PEPTIDE, TRANSITION, FEATURE, LABEL,
- RUN, ABUNDANCE, newABUNDANCE,FRACTION, censored, analysis)], quant,fill=TRUE)
- combined$analysis = factor(combined$analysis)
- combined$FEATURE = factor(combined$FEATURE)
- combined$RUN = as.numeric(combined$RUN)
- profile_plot = .makeSummaryProfilePlot(
- combined, is_censored, y.limdown, y.limup, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size, dot.size.profile, cumGroupAxis,
+ quant$analysis <- "Run summary"
+ quant$newABUNDANCE <- quant$ABUNDANCE
+ single_protein$analysis <- "Processed feature-level data"
+ combined <- rbind(single_protein[
+ ,
+ list(
+ PROTEIN, PEPTIDE, TRANSITION, FEATURE, LABEL,
+ RUN, ABUNDANCE, newABUNDANCE, FRACTION, censored, analysis
+ )
+ ], quant, fill = TRUE)
+ combined$analysis <- factor(combined$analysis)
+ combined$FEATURE <- factor(combined$FEATURE)
+ combined$RUN <- as.numeric(combined$RUN)
+ profile_plot <- .makeSummaryProfilePlot(
+ combined, is_censored, y.limdown, y.limup, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size, dot.size.profile, cumGroupAxis,
yaxis.name, lineNameAxis, groupNametemp
)
print(profile_plot)
setTxtProgressBar(pb, i)
- output_plots[["summary_plot"]][[paste("plot",i)]] <- profile_plot
-
+ output_plots[["summary_plot"]][[paste("plot", i)]] <- profile_plot
}
close(pb)
-
+
if (address != FALSE & !isPlotly) {
dev.off()
- }
+ }
}
- if(isPlotly) {
- output_plots
+ if (isPlotly) {
+ output_plots
}
-
}
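# A small standalone sketch of the x-axis bookkeeping used above, assuming a toy
# design with 2, 3 and 1 runs in groups A, B and C: xtabs() counts runs per group,
# cumsum() gives the run index at which each group ends (where the separators are
# drawn), and the group labels are centred within each block of runs.
toy <- data.frame(GROUP = factor(c("A", "A", "B", "B", "B", "C")), RUN = 1:6)
groupAxis <- as.numeric(xtabs(~GROUP, toy))        # 2 3 1: runs per group
cumGroupAxis <- cumsum(groupAxis)                  # 2 5 6: last run of each group
lineNameAxis <- cumGroupAxis[-nlevels(toy$GROUP)]  # 2 5: separator positions
c(0, lineNameAxis) + groupAxis / 2 + 0.5           # 1.5 4.0 6.0: label positions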
#' @importFrom stats xtabs
#' @importFrom utils setTxtProgressBar
-.plotQC = function(
- processed, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size, text.size,
- text.angle, legend.size, dot.size.profile, width, height, protein, address, isPlotly
-) {
- GROUP = SUBJECT = RUN = LABEL = PROTEIN = NULL
-
- yaxis.name = .getYaxis(processed)
- y.limup = ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(processed$ABUNDANCE, na.rm = TRUE) + 3))
- y.limdown = ifelse(is.numeric(ylimDown), ylimDown, -1)
-
- processed = processed[order(GROUP, SUBJECT), ]
- processed[, RUN := factor(RUN, levels = unique(processed$RUN),
- labels = seq(1, data.table::uniqueN(processed$RUN)))]
-
+.plotQC <- function(
+ processed, featureName, ylimUp, ylimDown, x.axis.size, y.axis.size, text.size,
+ text.angle, legend.size, dot.size.profile, width, height, protein, address, isPlotly) {
+ GROUP <- SUBJECT <- RUN <- LABEL <- PROTEIN <- NULL
+
+ yaxis.name <- .getYaxis(processed)
+ y.limup <- ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(processed$ABUNDANCE, na.rm = TRUE) + 3))
+ y.limdown <- ifelse(is.numeric(ylimDown), ylimDown, -1)
+
+ processed <- processed[order(GROUP, SUBJECT), ]
+ processed[, RUN := factor(RUN,
+ levels = unique(processed$RUN),
+ labels = seq(1, data.table::uniqueN(processed$RUN))
+ )]
+
if (length(unique(processed$LABEL)) == 2) {
processed[, LABEL := factor(LABEL, labels = c("Reference", "Endogenous"))]
- label.color = c("darkseagreen1", "lightblue")
+ label.color <- c("darkseagreen1", "lightblue")
} else {
if (unique(processed$LABEL) == "L") {
processed[, LABEL := factor(LABEL, labels = c("Endogenous"))]
- label.color = c("lightblue")
+ label.color <- c("lightblue")
} else {
processed[, LABEL := factor(LABEL, labels = c("Reference"))]
- label.color = c("darkseagreen1")
+ label.color <- c("darkseagreen1")
}
}
-
- processed = processed[order(LABEL, GROUP, SUBJECT)]
-
+
+ processed <- processed[order(LABEL, GROUP, SUBJECT)]
+
## Meena :due to GROUP=0 for labeled.. extra care required.
- tempGroupName = unique(processed[, list(GROUP, RUN)])
+ tempGroupName <- unique(processed[, list(GROUP, RUN)])
if (length(unique(processed$LABEL)) == 2) {
- tempGroupName = tempGroupName[GROUP != '0']
- }
- tempGroupName = tempGroupName[order(RUN), ] ## Meena : should we order by GROUP or RUN? I guess by RUn, because x-axis is by RUN
- level.group = as.character(unique(tempGroupName$GROUP))
- tempGroupName$GROUP = factor(tempGroupName$GROUP,
- levels = level.group) ## Meena : factor GROUP again, due to 1, 10, 2, ... if you have better way, please change
-
- groupAxis = as.numeric(xtabs(~GROUP, tempGroupName))
- cumGroupAxis = cumsum(groupAxis)
- lineNameAxis = cumGroupAxis[-nlevels(tempGroupName$GROUP)]
- groupName = data.frame(RUN = c(0, lineNameAxis) + groupAxis / 2 + 0.5,
- ABUNDANCE = rep(y.limup - 1, length(groupAxis)),
- Name = levels(tempGroupName$GROUP))
+ tempGroupName <- tempGroupName[GROUP != "0"]
+ }
+  tempGroupName <- tempGroupName[order(RUN), ] ## Meena: should we order by GROUP or RUN? I guess by RUN, because the x-axis is by RUN
+ level.group <- as.character(unique(tempGroupName$GROUP))
+ tempGroupName$GROUP <- factor(tempGroupName$GROUP,
+ levels = level.group
+  ) ## Meena: factor GROUP again to avoid alphabetical ordering like 1, 10, 2, ...; if you have a better way, please change
+
+ groupAxis <- as.numeric(xtabs(~GROUP, tempGroupName))
+ cumGroupAxis <- cumsum(groupAxis)
+ lineNameAxis <- cumGroupAxis[-nlevels(tempGroupName$GROUP)]
+ groupName <- data.frame(
+ RUN = c(0, lineNameAxis) + groupAxis / 2 + 0.5,
+ ABUNDANCE = rep(y.limup - 1, length(groupAxis)),
+ Name = levels(tempGroupName$GROUP)
+ )
if (!isPlotly) {
- savePlot(address, "QCPlot", width, height)
+ savePlot(address, "QCPlot", width, height)
}
- all_proteins = as.character(levels(processed$PROTEIN))
+ all_proteins <- as.character(levels(processed$PROTEIN))
plots <- vector("list", length(all_proteins) + 1) # +1 for all/allonly plot
if (protein %in% c("all", "allonly")) {
- qc_plot = .makeQCPlot(processed, TRUE, y.limdown, y.limup, x.axis.size,
- y.axis.size, text.size, text.angle, legend.size,
- label.color, cumGroupAxis, groupName, lineNameAxis,
- yaxis.name)
+ qc_plot <- .makeQCPlot(
+ processed, TRUE, y.limdown, y.limup, x.axis.size,
+ y.axis.size, text.size, text.angle, legend.size,
+ label.color, cumGroupAxis, groupName, lineNameAxis,
+ yaxis.name
+ )
print(qc_plot)
- plots[[1]] = qc_plot
- }
-
+ plots[[1]] <- qc_plot
+ }
+
if (protein != "allonly") {
- all_proteins = as.character(levels(processed$PROTEIN))
+ all_proteins <- as.character(levels(processed$PROTEIN))
if (protein != "all") {
- selected_proteins = getSelectedProteins(protein, all_proteins)
- processed = processed[PROTEIN %in% selected_proteins]
+ selected_proteins <- getSelectedProteins(protein, all_proteins)
+ processed <- processed[PROTEIN %in% selected_proteins]
processed[, PROTEIN := factor(PROTEIN)]
}
- pb = utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
- for (i in seq_along(all_proteins)) {
-
- single_protein = processed[processed$PROTEIN == all_proteins[i], ]
- single_protein = single_protein[order(LABEL, RUN)]
+ pb <- utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
+ for (i in seq_along(all_proteins)) {
+ single_protein <- processed[processed$PROTEIN == all_proteins[i], ]
+ single_protein <- single_protein[order(LABEL, RUN)]
if (all(is.na(single_protein$ABUNDANCE))) {
next()
}
- qc_plot = .makeQCPlot(single_protein, FALSE, y.limdown, y.limup,
- x.axis.size, y.axis.size, text.size, text.angle,
- legend.size, label.color, cumGroupAxis, groupName,
- lineNameAxis, yaxis.name)
+ qc_plot <- .makeQCPlot(
+ single_protein, FALSE, y.limdown, y.limup,
+ x.axis.size, y.axis.size, text.size, text.angle,
+ legend.size, label.color, cumGroupAxis, groupName,
+ lineNameAxis, yaxis.name
+ )
print(qc_plot)
- plots[[i+1]] = qc_plot # to accomodate all proteins
+      plots[[i + 1]] <- qc_plot # shifted by 1 to accommodate the all/allonly plot in slot 1
setTxtProgressBar(pb, i)
- }
+ }
close(pb)
- }
+ }
if (address != FALSE) {
dev.off()
}
if (isPlotly) {
- plots <- Filter(function(x) !is.null(x), plots) # remove if protein was not "all"
- plots
+ plots <- Filter(function(x) !is.null(x), plots) # remove if protein was not "all"
+ plots
}
-}
+}
#' @importFrom stats qt sd na.omit
#' @importFrom utils setTxtProgressBar
#' @keywords internal
-.plotCondition = function(
- processed, summarized, ylimUp, ylimDown, scale, interval, x.axis.size,
- y.axis.size, text.size, text.angle, legend.size, dot.size.profile,
- dot.size.condition, width, height, protein, save_plot, address, isPlotly
-) {
- adj.pvalue = Protein = ciw = PROTEIN = GROUP = SUBJECT = ABUNDANCE = NULL
-
- data.table::setnames(summarized, c("Protein", "LogIntensities"),
- c("PROTEIN", "ABUNDANCE"))
- all_proteins = levels(summarized$PROTEIN)
+.plotCondition <- function(
+ processed, summarized, ylimUp, ylimDown, scale, interval, x.axis.size,
+ y.axis.size, text.size, text.angle, legend.size, dot.size.profile,
+ dot.size.condition, width, height, protein, save_plot, address, isPlotly) {
+ adj.pvalue <- Protein <- ciw <- PROTEIN <- GROUP <- SUBJECT <- ABUNDANCE <- NULL
+
+ data.table::setnames(
+ summarized, c("Protein", "LogIntensities"),
+ c("PROTEIN", "ABUNDANCE")
+ )
+ all_proteins <- levels(summarized$PROTEIN)
if (protein != "all") {
- proteins = getSelectedProteins(protein, all_proteins)
- summarized = summarized[PROTEIN %in% proteins, ]
+ proteins <- getSelectedProteins(protein, all_proteins)
+ summarized <- summarized[PROTEIN %in% proteins, ]
summarized[, PROTEIN := factor(PROTEIN)]
}
- yaxis.name = .getYaxis(processed)
-
- results = vector("list", length(all_proteins))
- if(!isPlotly) {
- savePlot(address, "ConditionPlot", width, height)
+ yaxis.name <- .getYaxis(processed)
+
+ results <- vector("list", length(all_proteins))
+ if (!isPlotly) {
+ savePlot(address, "ConditionPlot", width, height)
}
plots <- vector("list", length(all_proteins))
- pb = utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
+ pb <- utils::txtProgressBar(min = 0, max = length(all_proteins), style = 3)
for (i in seq_along(all_proteins)) {
- single_protein = summarized[PROTEIN == all_proteins[i], ]
- single_protein = na.omit(single_protein)
+ single_protein <- summarized[PROTEIN == all_proteins[i], ]
+ single_protein <- na.omit(single_protein)
single_protein[, GROUP := factor(GROUP)]
single_protein[, SUBJECT := factor(SUBJECT)]
if (all(is.na(single_protein$ABUNDANCE))) {
next()
}
- sp_all = single_protein[, list(Mean = mean(ABUNDANCE, na.rm = TRUE),
- SD = sd(ABUNDANCE, na.rm = TRUE),
- numMeasurement = .N),
- by = "GROUP"]
+ sp_all <- single_protein[, list(
+ Mean = mean(ABUNDANCE, na.rm = TRUE),
+ SD = sd(ABUNDANCE, na.rm = TRUE),
+ numMeasurement = .N
+ ),
+ by = "GROUP"
+ ]
if (interval == "CI") {
sp_all[, ciw := qt(0.975, sp_all$numMeasurement) * sp_all$SD / sqrt(sp_all$numMeasurement)]
} else {
@@ -565,31 +596,33 @@ dataProcessPlots = function(
if (sum(is.na(sp_all$ciw)) >= 1) {
sp_all[is.na(sp_all$ciw), ciw := 0]
}
-
- y.limup = ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(sp_all$Mean + sp_all$ciw)))
- y.limdown = ifelse(is.numeric(ylimDown), ylimDown, floor(min(sp_all$Mean - sp_all$ciw)))
-
- sp_all = sp_all[order(GROUP), ]
- sp_all$Protein = all_proteins[i]
+
+ y.limup <- ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(sp_all$Mean + sp_all$ciw)))
+ y.limdown <- ifelse(is.numeric(ylimDown), ylimDown, floor(min(sp_all$Mean - sp_all$ciw)))
+
+ sp_all <- sp_all[order(GROUP), ]
+ sp_all$Protein <- all_proteins[i]
if (save_plot) {
- results[[i]] = sp_all
+ results[[i]] <- sp_all
}
- con_plot = .makeConditionPlot(sp_all, scale, single_protein, y.limdown,
- y.limup, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size,
- dot.size.condition, yaxis.name)
+ con_plot <- .makeConditionPlot(
+ sp_all, scale, single_protein, y.limdown,
+ y.limup, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size,
+ dot.size.condition, yaxis.name
+ )
print(con_plot)
- plots[[i]] = con_plot
+ plots[[i]] <- con_plot
setTxtProgressBar(pb, i)
}
close(pb)
-
+
if (address != FALSE & !isPlotly) {
dev.off()
}
-
+
if (save_plot) {
- result = data.table::rbindlist(results)
+ result <- data.table::rbindlist(results)
data.table::setnames(result, "GROUP", "Condition")
if (interval == "CI") {
data.table::setnames(result, "ciw", "95% CI")
@@ -599,154 +632,153 @@ dataProcessPlots = function(
.saveTable(result, address, "ConditionPlot_value")
}
if (isPlotly) {
- plots
+ plots
}
}
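# A worked sketch of the interval = "CI" error bars above: for one condition with
# n summarized abundances, the half-width is the two-sided 95% t-quantile times
# the standard error of the mean (mirroring the qt(0.975, n) call used above).
abundance <- c(22.1, 22.6, 21.8)               # toy log-intensities for one GROUP
n <- length(abundance)
ciw <- qt(0.975, n) * sd(abundance) / sqrt(n)  # half-width of the plotted interval
c(mean(abundance) - ciw, mean(abundance) + ciw)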
#' converter for plots from ggplot to plotly
#' @noRd
-.convertGgplot2Plotly = function(plot, tips = "all") {
- converted_plot <- ggplotly(plot,tooltip = tips)
- converted_plot <- plotly::layout(
- converted_plot,
- width = 800, # Set the width of the chart in pixels
- height = 600, # Set the height of the chart in pixels
- title = list(
- font = list(
- size = 18
- )
- ),
- xaxis = list(
- titlefont = list(
- size = 15 # Set the font size for the x-axis label
- )
- ),
- legend = list(
- x = 0, # Set the x position of the legend
- y = -0.25, # Set the y position of the legend (negative value to move below the plot)
- orientation = "h", # Horizontal orientation
- font = list(
- size = 12 # Set the font size for legend item labels
- ),
- title = list(
- font = list(
- size = 12 # Set the font size for the legend title
- )
- )
- )
- )
- converted_plot
+.convertGgplot2Plotly <- function(plot, tips = "all") {
+ converted_plot <- ggplotly(plot, tooltip = tips)
+ converted_plot <- plotly::layout(
+ converted_plot,
+ width = 800, # Set the width of the chart in pixels
+ height = 600, # Set the height of the chart in pixels
+ title = list(
+ font = list(
+ size = 18
+ )
+ ),
+ xaxis = list(
+ titlefont = list(
+ size = 15 # Set the font size for the x-axis label
+ )
+ ),
+ legend = list(
+ x = 0, # Set the x position of the legend
+ y = -0.25, # Set the y position of the legend (negative value to move below the plot)
+ orientation = "h", # Horizontal orientation
+ font = list(
+ size = 12 # Set the font size for legend item labels
+ ),
+ title = list(
+ font = list(
+ size = 12 # Set the font size for the legend title
+ )
+ )
+ )
+ )
+ converted_plot
}
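# A minimal sketch of the ggplot-to-plotly conversion wrapped above, using a toy
# ggplot2 scatter plot; the tooltip argument restricts the hover text to the named
# aesthetics, in the same way .convertGgplot2Plotly() uses tips.
library(ggplot2)
library(plotly)
gg <- ggplot(mtcars, aes(x = wt, y = mpg, colour = factor(cyl))) +
    geom_point()
p <- ggplotly(gg, tooltip = c("x", "y"))
p <- plotly::layout(p, legend = list(orientation = "h", x = 0, y = -0.25))
p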
-.retainCensoredDataPoints = function(plot) {
- df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
- for (i in seq_along(plot$x$data)) {
- if (df$legend_entries[i] != "Detected data" && df$legend_entries[i] != "Censored missing data") {
- plot$x$data[[i]]$showlegend <- FALSE
- }
+.retainCensoredDataPoints <- function(plot) {
+ df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
+ for (i in seq_along(plot$x$data)) {
+ if (df$legend_entries[i] != "Detected data" && df$legend_entries[i] != "Censored missing data") {
+ plot$x$data[[i]]$showlegend <- FALSE
}
- plot
+ }
+ plot
}
-.fixLegendPlotlyPlotsDataprocess = function(plot) {
- df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
- df$legend_group <- gsub("^\\((.*?),.*", "\\1", df$legend_entries)
- df$is_first <- !duplicated(df$legend_group)
- df$is_bool <- ifelse(grepl("TRUE|FALSE", df$legend_group), TRUE, FALSE)
- # df[nrow(df), "is_first"] <- FALSE
- plot$x$data[[nrow(df)]]$showlegend <- FALSE # remove text legend
- for (i in df$id) {
- is_first <- df$is_first[[i]]
- is_bool <- df$is_bool[[i]]
- plot$x$data[[i]]$name <- df$legend_group[[i]]
- plot$x$data[[i]]$legendgroup <- plot$x$data[[i]]$name
- if (!is_first) plot$x$data[[i]]$showlegend <- FALSE
- if(is_bool) plot$x$data[[i]]$showlegend <- FALSE
- }
- plot
-
+.fixLegendPlotlyPlotsDataprocess <- function(plot) {
+ df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
+ df$legend_group <- gsub("^\\((.*?),.*", "\\1", df$legend_entries)
+ df$is_first <- !duplicated(df$legend_group)
+ df$is_bool <- ifelse(grepl("TRUE|FALSE", df$legend_group), TRUE, FALSE)
+ # df[nrow(df), "is_first"] <- FALSE
+ plot$x$data[[nrow(df)]]$showlegend <- FALSE # remove text legend
+ for (i in df$id) {
+ is_first <- df$is_first[[i]]
+ is_bool <- df$is_bool[[i]]
+ plot$x$data[[i]]$name <- df$legend_group[[i]]
+ plot$x$data[[i]]$legendgroup <- plot$x$data[[i]]$name
+ if (!is_first) plot$x$data[[i]]$showlegend <- FALSE
+ if (is_bool) plot$x$data[[i]]$showlegend <- FALSE
+ }
+ plot
}
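# A small illustration of the legend clean-up above, assuming ggplotly() trace
# names of the form "(Condition1,1)": the regular expression keeps only the first
# element inside the parentheses, and duplicated() then decides which trace in
# each group keeps its legend entry.
legend_entries <- c("(Condition1,1)", "(Condition1,2)", "(Condition2,1)")
legend_group <- gsub("^\\((.*?),.*", "\\1", legend_entries)
legend_group               # "Condition1" "Condition1" "Condition2"
!duplicated(legend_group)  # TRUE FALSE TRUE: only the first trace per group is shown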
-.fixCensoredPointsLegendProfilePlotsPlotly = function(plot) {
- df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
- first_false_index <- which(df$legend_entries == "FALSE")[1]
- first_true_index <- which(df$legend_entries == "TRUE")[1]
+.fixCensoredPointsLegendProfilePlotsPlotly <- function(plot) {
+ df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
+ first_false_index <- which(df$legend_entries == "FALSE")[1]
+ first_true_index <- which(df$legend_entries == "TRUE")[1]
- # Update plot data for the first occurrence of "FALSE"
- if (!is.na(first_false_index)) {
- plot$x$data[[first_false_index]]$name <- "Detected data"
- plot$x$data[[first_false_index]]$showlegend <- TRUE
- }
+ # Update plot data for the first occurrence of "FALSE"
+ if (!is.na(first_false_index)) {
+ plot$x$data[[first_false_index]]$name <- "Detected data"
+ plot$x$data[[first_false_index]]$showlegend <- TRUE
+ }
- # Update plot data for the first occurrence of "TRUE"
- if (!is.na(first_true_index)) {
- plot$x$data[[first_true_index]]$name <- "Censored missing data"
- plot$x$data[[first_true_index]]$showlegend <- TRUE
- }
- plot
+ # Update plot data for the first occurrence of "TRUE"
+ if (!is.na(first_true_index)) {
+ plot$x$data[[first_true_index]]$name <- "Censored missing data"
+ plot$x$data[[first_true_index]]$showlegend <- TRUE
+ }
+ plot
}
-.fixLegendPlotlyPlotsVolcano = function(plot) {
- df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
- # Create a mapping
- color_mapping <- c("black" = "No regulation", "red" = "Up-regulated", "blue" = "Down-regulated")
- # Update the legend_entries column
- df$legend_group <- sapply(df$legend_entries, function(entry) {
- for (color in names(color_mapping)) {
- if (grepl(color, entry)) {
- entry <- gsub(color, color_mapping[color], entry)
- break
- }
- }
- entry <- gsub(",.+", "", entry)
- entry <- gsub("\\(|\\)", "", entry) # Remove any remaining parentheses
- entry
- })
- for (i in df$id) {
- if(length(grep(df$legend_group[[i]], color_mapping)) == 0) { # keep only 3 legends
- plot$x$data[[i]]$showlegend <- FALSE
- }
- plot$x$data[[i]]$name <- df$legend_group[[i]]
- plot$x$data[[i]]$legendgroup <- plot$x$data[[i]]$name
+.fixLegendPlotlyPlotsVolcano <- function(plot) {
+ df <- data.frame(id = seq_along(plot$x$data), legend_entries = unlist(lapply(plot$x$data, `[[`, "name")))
+ # Create a mapping
+ color_mapping <- c("black" = "No regulation", "red" = "Up-regulated", "blue" = "Down-regulated")
+ # Update the legend_entries column
+ df$legend_group <- sapply(df$legend_entries, function(entry) {
+ for (color in names(color_mapping)) {
+ if (grepl(color, entry)) {
+ entry <- gsub(color, color_mapping[color], entry)
+ break
+ }
+ }
+ entry <- gsub(",.+", "", entry)
+ entry <- gsub("\\(|\\)", "", entry) # Remove any remaining parentheses
+ entry
+ })
+ for (i in df$id) {
+ if (length(grep(df$legend_group[[i]], color_mapping)) == 0) { # keep only 3 legends
+ plot$x$data[[i]]$showlegend <- FALSE
}
- plot <- plotly::layout(plot,legend=list(title=list(text="")))
- plot
+ plot$x$data[[i]]$name <- df$legend_group[[i]]
+ plot$x$data[[i]]$legendgroup <- plot$x$data[[i]]$name
+ }
+ plot <- plotly::layout(plot, legend = list(title = list(text = "")))
+ plot
}
-.getPlotlyPlotHTML = function(plots, width, height) {
- doc <- htmltools::tagList(lapply(plots,function(x) htmltools::div(x, style = "float:left;width:100%;")))
- # Set a specific width for each plot
- plot_width <- 800
- plot_height <- 600
+.getPlotlyPlotHTML <- function(plots, width, height) {
+ doc <- htmltools::tagList(lapply(plots, function(x) htmltools::div(x, style = "float:left;width:100%;")))
+ # Set a specific width for each plot
+ plot_width <- 800
+ plot_height <- 600
- # Create a div for each plot with style settings
- divs <- lapply(plots, function(x) {
- htmltools::div(x, style = paste0("width:", plot_width, "px; height:", plot_height, "px; margin: 10px;"))
- })
+ # Create a div for each plot with style settings
+ divs <- lapply(plots, function(x) {
+ htmltools::div(x, style = paste0("width:", plot_width, "px; height:", plot_height, "px; margin: 10px;"))
+ })
- # Combine the divs into a tagList
- doc <- htmltools::tagList(divs)
- doc
+ # Combine the divs into a tagList
+ doc <- htmltools::tagList(divs)
+ doc
}
-.savePlotlyPlotHTML = function(plots, address, file_name, width, height) {
- print("Saving plots as HTML")
- pb <- txtProgressBar(min = 0, max = 4, style = 3)
-
- setTxtProgressBar(pb, 1)
- file_name = getFileName(address, file_name, width, height)
- file_name = paste0(file_name,".html")
-
- setTxtProgressBar(pb, 2)
- doc <- .getPlotlyPlotHTML(plots, width, height)
-
- setTxtProgressBar(pb, 3)
- htmltools::save_html(html = doc, file = file_name) # works but lib same folder
-
- setTxtProgressBar(pb, 4)
- zip(paste0(gsub("\\.html$", "", file_name),".zip"), c(file_name, "lib"))
- unlink(file_name)
- unlink("lib",recursive = TRUE)
-
- close(pb)
+.savePlotlyPlotHTML <- function(plots, address, file_name, width, height) {
+ print("Saving plots as HTML")
+ pb <- txtProgressBar(min = 0, max = 4, style = 3)
+
+ setTxtProgressBar(pb, 1)
+ file_name <- getFileName(address, file_name, width, height)
+ file_name <- paste0(file_name, ".html")
+
+ setTxtProgressBar(pb, 2)
+ doc <- .getPlotlyPlotHTML(plots, width, height)
+
+ setTxtProgressBar(pb, 3)
+ htmltools::save_html(html = doc, file = file_name) # works but lib same folder
+
+ setTxtProgressBar(pb, 4)
+ zip(paste0(gsub("\\.html$", "", file_name), ".zip"), c(file_name, "lib"))
+ unlink(file_name)
+ unlink("lib", recursive = TRUE)
+
+ close(pb)
}
diff --git a/R/designSampleSize.R b/R/designSampleSize.R
index 4bb21aeb..407c1333 100644
--- a/R/designSampleSize.R
+++ b/R/designSampleSize.R
@@ -1,149 +1,165 @@
#' Planning future experimental designs of Selected Reaction Monitoring (SRM), Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiments in sample size calculation
#'
-#' @description Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM),
-#' Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment
-#' based on intensity-based linear model. Two options of the calculation:
-#' (1) number of biological replicates per condition,
+#' @description Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM),
+#' Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment
+#' based on an intensity-based linear model. Two options for the calculation:
+#' (1) number of biological replicates per condition,
#' (2) power.
-#'
+#'
#' @param data 'FittedModel' in testing output from function groupComparison.
-#' @param desiredFC the range of a desired fold change which includes the lower
+#' @param desiredFC the range of a desired fold change which includes the lower
#' and upper values of the desired fold change.
-#' @param FDR a pre-specified false discovery ratio (FDR) to control the overall
+#' @param FDR a pre-specified false discovery rate (FDR) to control the overall
#' false positive rate. Default is 0.05
-#' @param numSample minimal number of biological replicates per condition.
-#' TRUE represents you require to calculate the sample size for this category,
+#' @param numSample minimal number of biological replicates per condition.
+#' TRUE indicates that the sample size for this category should be calculated;
#' else you should input the exact number of biological replicates.
-#' @param power a pre-specified statistical power which defined as the probability
-#' of detecting a true fold change. TRUE represent you require to calculate the power
+#' @param power a pre-specified statistical power, defined as the probability
+#' of detecting a true fold change. TRUE indicates that the power should be calculated
#' for this category, else you should input the average of power you expect. Default is 0.9
#' @inheritParams .documentFunction
-#'
-#' @details The function fits the model and uses variance components to calculate
-#' sample size. The underlying model fitting with intensity-based linear model with
+#'
+#' @details The function fits the model and uses variance components to calculate
+#' sample size. The underlying model fitting with intensity-based linear model with
#' technical MS run replication. Estimated sample size is rounded to 0 decimal.
+#' The function can only compute one category of the sample size
+#' The function can only obtain either one of the categories of the sample size
#' calculation (numSample, numPep, numTran, power) at the same time.
-#'
+#'
#' @return data.frame - sample size calculation results including variables:
#' desiredFC, numSample, FDR, and power.
-#'
+#'
#' @importFrom stats median
#' @importFrom plotly ggplotly style add_trace plot_ly subplot layout
-#'
+#'
#' @export
-#'
-#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
-#'
+#'
+#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
+#'
#' @examples
#' # Consider quantitative data (i.e. QuantData) from yeast study.
#' # A time course study with ten time points of interests and three biological replicates.
#' QuantData <- dataProcess(SRMRawData)
#' head(QuantData$FeatureLevelData)
#' ## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-#' comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-#' comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-#' comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-#' comparison<-rbind(comparison1,comparison2, comparison3)
-#' row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-#' colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
-#'
-#' testResultMultiComparisons<-groupComparison(contrast.matrix=comparison,data=QuantData)
-#'
+#' comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+#' comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+#' comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+#' comparison <- rbind(comparison1, comparison2, comparison3)
+#' row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+#' colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
+#'
+#' testResultMultiComparisons <- groupComparison(contrast.matrix = comparison, data = QuantData)
+#'
#' ## Calculate sample size for future experiments:
-#' #(1) Minimal number of biological replicates per condition
-#' designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
-#' desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
-#' #(2) Power calculation
-#' designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
-#' desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
-#'
-designSampleSize = function(
+#' # (1) Minimal number of biological replicates per condition
+#' designSampleSize(
+#' data = testResultMultiComparisons$FittedModel, numSample = TRUE,
+#' desiredFC = c(1.25, 1.75), FDR = 0.05, power = 0.8
+#' )
+#' # (2) Power calculation
+#' designSampleSize(
+#' data = testResultMultiComparisons$FittedModel, numSample = 2,
+#' desiredFC = c(1.25, 1.75), FDR = 0.05, power = TRUE
+#' )
+#'
+designSampleSize <- function(
data, desiredFC, FDR = 0.05, numSample = TRUE, power = 0.9,
- use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL
-) {
- MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
- log_file_path, "MSstats_sampleSize_log_")
- getOption("MSstatsLog")("INFO", "** MSstats - designSampleSize function")
- getOption("MSstatsLog")("INFO", paste0("Desired fold change = ",
- paste(desiredFC, collapse=" - ")))
- getOption("MSstatsLog")("INFO", paste0("FDR = ", FDR))
- getOption("MSstatsLog")("INFO", paste0("Power = ", power))
-
- var_component = .getVarComponent(data)
- ## for label-free DDA, there are lots of missingness and lots of zero SE. So, remove NA SE.
- median_sigma_error = median(var_component[["Error"]], na.rm = TRUE)
- median_sigma_subject = .getMedianSigmaSubject(var_component)
- getOption("MSstatsLog")("INFO", "Calculated variance component. - okay")
+ use_log_file = TRUE, append = FALSE, verbose = TRUE, log_file_path = NULL) {
+ MSstatsConvert::MSstatsLogsSettings(
+ use_log_file, append, verbose,
+ log_file_path, "MSstats_sampleSize_log_"
+ )
+ getOption("MSstatsLog")("INFO", "** MSstats - designSampleSize function")
+ getOption("MSstatsLog")("INFO", paste0(
+ "Desired fold change = ",
+ paste(desiredFC, collapse = " - ")
+ ))
+ getOption("MSstatsLog")("INFO", paste0("FDR = ", FDR))
+ getOption("MSstatsLog")("INFO", paste0("Power = ", power))
- ## power calculation
- if (isTRUE(power)) {
- delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
- desiredFC = 2 ^ delta
- power_output = .calculatePower(desiredFC, FDR, delta, median_sigma_error,
- median_sigma_subject, numSample)
- CV = round( (2 * (median_sigma_error / numSample + median_sigma_subject / numSample)) / desiredFC, 3)
- getOption("MSstatsLog")("INFO", "Power is calculated. - okay")
- sample_size = data.frame(desiredFC, numSample, FDR,
- power = power_output, CV)
- }
-
- if (is.numeric(power)) {
- delta = log2(seq(desiredFC[1], desiredFC[2], 0.025))
- desiredFC = 2 ^ delta
- ## Large portion of proteins are not changing
- m0_m1 = 99 ## it means m0/m1=99, m0/(m0+m1)=0.99
- alpha = power * FDR / (1 + (1 - FDR) * m0_m1)
- if (isTRUE(numSample)) {
- numSample = .getNumSample(desiredFC, power, alpha, delta,
- median_sigma_error, median_sigma_subject)
- CV = round(2 * (median_sigma_error / numSample + median_sigma_subject / numSample) / desiredFC, 3)
- getOption("MSstatsLog")("INFO", "The number of sample is calculated. - okay")
- sample_size = data.frame(desiredFC, numSample, FDR, power, CV)
- }
- }
- sample_size
+ var_component <- .getVarComponent(data)
+  ## for label-free DDA, there is a lot of missingness and many zero SEs, so remove the NA SEs.
+ median_sigma_error <- median(var_component[["Error"]], na.rm = TRUE)
+ median_sigma_subject <- .getMedianSigmaSubject(var_component)
+ getOption("MSstatsLog")("INFO", "Calculated variance component. - okay")
+
+ ## power calculation
+ if (isTRUE(power)) {
+ delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
+ desiredFC <- 2^delta
+ power_output <- .calculatePower(
+ desiredFC, FDR, delta, median_sigma_error,
+ median_sigma_subject, numSample
+ )
+ CV <- round((2 * (median_sigma_error / numSample + median_sigma_subject / numSample)) / desiredFC, 3)
+ getOption("MSstatsLog")("INFO", "Power is calculated. - okay")
+ sample_size <- data.frame(desiredFC, numSample, FDR,
+ power = power_output, CV
+ )
+ }
+
+ if (is.numeric(power)) {
+ delta <- log2(seq(desiredFC[1], desiredFC[2], 0.025))
+ desiredFC <- 2^delta
+ ## Large portion of proteins are not changing
+ m0_m1 <- 99 ## it means m0/m1=99, m0/(m0+m1)=0.99
+ alpha <- power * FDR / (1 + (1 - FDR) * m0_m1)
+ if (isTRUE(numSample)) {
+ numSample <- .getNumSample(
+ desiredFC, power, alpha, delta,
+ median_sigma_error, median_sigma_subject
+ )
+ CV <- round(2 * (median_sigma_error / numSample + median_sigma_subject / numSample) / desiredFC, 3)
+ getOption("MSstatsLog")("INFO", "The number of sample is calculated. - okay")
+ sample_size <- data.frame(desiredFC, numSample, FDR, power, CV)
+ }
+ }
+ sample_size
}
#' Get variances from models fitted by the groupComparison function
#' @param fitted_models FittedModels element of groupComparison output
#' @keywords internal
-.getVarComponent = function(fitted_models) {
- Protein = NULL
-
- result = data.table::rbindlist(
- lapply(fitted_models, function(fit) {
- if (!is.null(fit)) {
- if (!is(fit, "lmerMod")) {
- error = summary(fit)$sigma^2
- subject <- NA
- group_subject <- NA
- } else {
- stddev = c(sapply(lme4::VarCorr(fit), function(el) attr(el, "stddev")),
- attr(lme4::VarCorr(fit), "sc"))
- error = stddev[names(stddev) == ""]^2
- if (any(names(stddev) %in% "SUBJECT.(Intercept)")) {
- subject = stddev["SUBJECT.(Intercept)"]^2
- } else {
- subject = NA
- }
- if (any(names(stddev) %in% "SUBJECT:GROUP.(Intercept)")) {
- group_subject = stddev["SUBJECT:GROUP.(Intercept)"]^2
- } else {
- group_subject = NA
- }
- }
- list(Error = error,
- Subject = subject,
- GroupBySubject = group_subject)
- } else {
- NULL
- }
- })
- )
- result[, Protein := 1:.N]
- result
+.getVarComponent <- function(fitted_models) {
+ Protein <- NULL
+
+ result <- data.table::rbindlist(
+ lapply(fitted_models, function(fit) {
+ if (!is.null(fit)) {
+ if (!is(fit, "lmerMod")) {
+ error <- summary(fit)$sigma^2
+ subject <- NA
+ group_subject <- NA
+ } else {
+ stddev <- c(
+ sapply(lme4::VarCorr(fit), function(el) attr(el, "stddev")),
+ attr(lme4::VarCorr(fit), "sc")
+ )
+ error <- stddev[names(stddev) == ""]^2
+ if (any(names(stddev) %in% "SUBJECT.(Intercept)")) {
+ subject <- stddev["SUBJECT.(Intercept)"]^2
+ } else {
+ subject <- NA
+ }
+ if (any(names(stddev) %in% "SUBJECT:GROUP.(Intercept)")) {
+ group_subject <- stddev["SUBJECT:GROUP.(Intercept)"]^2
+ } else {
+ group_subject <- NA
+ }
+ }
+ list(
+ Error = error,
+ Subject = subject,
+ GroupBySubject = group_subject
+ )
+ } else {
+ NULL
+ }
+ })
+ )
+ result[, Protein := 1:.N]
+ result
}
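# A standalone sketch of the variance extraction above, assuming a toy
# random-intercept model fitted with lme4 (sleepstudy ships with lme4; the MSstats
# models use SUBJECT/GROUP terms instead of Subject).
library(lme4)
fit <- lmer(Reaction ~ Days + (1 | Subject), data = sleepstudy)
stddev <- c(
    sapply(VarCorr(fit), function(el) attr(el, "stddev")),
    attr(VarCorr(fit), "sc")
)
error_var <- unname(stddev[names(stddev) == ""]^2)      # residual variance
subject_var <- unname(stddev["Subject.(Intercept)"]^2)  # between-subject variance
c(Error = error_var, Subject = subject_var)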
@@ -151,17 +167,17 @@ designSampleSize = function(
#' @param var_component data.frame, output of .getVarComponent
#' @importFrom stats median
#' @keywords internal
-.getMedianSigmaSubject = function(var_component) {
- if (sum(!is.na(var_component[, "GroupBySubject"])) > 0) {
- median_sigma_subject = median(var_component[["GroupBySubject"]], na.rm=TRUE)
+.getMedianSigmaSubject <- function(var_component) {
+ if (sum(!is.na(var_component[, "GroupBySubject"])) > 0) {
+ median_sigma_subject <- median(var_component[["GroupBySubject"]], na.rm = TRUE)
+ } else {
+ if (sum(!is.na(var_component[, "Subject"])) > 0) {
+ median_sigma_subject <- median(var_component[["Subject"]], na.rm = TRUE)
} else {
- if (sum(!is.na(var_component[, "Subject"])) > 0) {
- median_sigma_subject = median(var_component[["Subject"]], na.rm=TRUE)
- } else {
- median_sigma_subject = 0
- }
+ median_sigma_subject <- 0
}
- median_sigma_subject
+ }
+ median_sigma_subject
}
@@ -172,18 +188,18 @@ designSampleSize = function(
#' @param median_sigma_subject median standard deviation per subject
#' @importFrom stats qnorm
#' @keywords internal
-.calculatePower = function(desiredFC, FDR, delta, median_sigma_error,
- median_sigma_subject, numSample) {
- m0_m1 = 99
- t = delta / sqrt(2 * (median_sigma_error / numSample + median_sigma_subject / numSample))
- powerTemp = seq(0, 1, 0.01)
- power = numeric(length(t))
- for (i in seq_along(t)) {
- diff = qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t[i]
- min(abs(diff), na.rm = TRUE)
- power[i] = powerTemp[order(abs(diff))][1]
- }
- power
+.calculatePower <- function(desiredFC, FDR, delta, median_sigma_error,
+ median_sigma_subject, numSample) {
+ m0_m1 <- 99
+ t <- delta / sqrt(2 * (median_sigma_error / numSample + median_sigma_subject / numSample))
+ powerTemp <- seq(0, 1, 0.01)
+ power <- numeric(length(t))
+ for (i in seq_along(t)) {
+ diff <- qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t[i]
+ min(abs(diff), na.rm = TRUE)
+ power[i] <- powerTemp[order(abs(diff))][1]
+ }
+ power
}
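# A numeric sketch of the grid search above for a single fold change, assuming
# hypothetical variance components: it picks the power value whose z-quantiles
# best balance the standardized effect under the FDR-adjusted significance level.
FDR <- 0.05; m0_m1 <- 99; numSample <- 3
median_sigma_error <- 0.2; median_sigma_subject <- 0.1
delta <- log2(1.5)
t_stat <- delta / sqrt(2 * (median_sigma_error / numSample + median_sigma_subject / numSample))
powerTemp <- seq(0, 1, 0.01)
gap <- qnorm(powerTemp) + qnorm(1 - powerTemp * FDR / (1 + (1 - FDR) * m0_m1) / 2) - t_stat
powerTemp[order(abs(gap))][1]  # approximate power for this design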
@@ -194,167 +210,192 @@ designSampleSize = function(
#' @param delta difference between means (?)
#' @importFrom stats qnorm
#' @keywords internal
-.getNumSample = function(desiredFC, power, alpha, delta, median_sigma_error,
- median_sigma_subject){
- z_alpha = qnorm(1 - alpha / 2)
- z_beta = qnorm(power)
- aa = (delta / (z_alpha + z_beta)) ^ 2
- numSample = round(2 * (median_sigma_error + median_sigma_subject) / aa, 0)
- numSample
+.getNumSample <- function(desiredFC, power, alpha, delta, median_sigma_error,
+ median_sigma_subject) {
+ z_alpha <- qnorm(1 - alpha / 2)
+ z_beta <- qnorm(power)
+ aa <- (delta / (z_alpha + z_beta))^2
+ numSample <- round(2 * (median_sigma_error + median_sigma_subject) / aa, 0)
+ numSample
}
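# A worked numeric sketch of the closed-form step above, assuming the same
# hypothetical variance components: the number of replicates per condition is twice
# the summed variance components divided by the squared detectable effect per
# z-unit, for each desired fold change on the log2 grid used by designSampleSize().
power <- 0.8; FDR <- 0.05; m0_m1 <- 99
alpha <- power * FDR / (1 + (1 - FDR) * m0_m1)
delta <- log2(seq(1.25, 1.75, 0.025))
median_sigma_error <- 0.2; median_sigma_subject <- 0.1
z_alpha <- qnorm(1 - alpha / 2)
z_beta <- qnorm(power)
aa <- (delta / (z_alpha + z_beta))^2
round(2 * (median_sigma_error + median_sigma_subject) / aa, 0)  # replicates per desired FC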
#' Visualization for sample size calculation
-#'
-#' @description To illustrate the relationship of desired fold change and the calculated
-#' minimal number sample size which are (1) number of biological replicates per condition,
-#' (2) number of peptides per protein,
-#' (3) number of transitions per peptide, and
+#'
+#' @description To illustrate the relationship between the desired fold change and the calculated
+#' minimal sample size, expressed as (1) number of biological replicates per condition,
+#' (2) number of peptides per protein,
+#' (3) number of transitions per peptide, and
#' (4) power. The input is the result from the function \code{\link{designSampleSize}}.
-#'
+#'
#' @param data output from function designSampleSize.
-#' @param isPlotly Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
+#' @param isPlotly Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
#' will save Plotly plots as HTML files. If set to FALSE MSstats will save ggplot2 plots
#' as PDF files
-#'
+#'
#' @details Data in the example is based on the results of sample size calculation from function \code{\link{designSampleSize}}
-#'
+#'
#' @return Plot for estimated sample size with assigned variable.
-#'
+#'
#' @export
-#'
-#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
+#'
+#' @author Meena Choi, Ching-Yun Chang, Olga Vitek.
#' @examples
#' # Based on the results of sample size calculation from function designSampleSize,
-#' # we generate a series of sample size plots for number of biological replicates, or peptides,
+#' # we generate a series of sample size plots for number of biological replicates, or peptides,
#' # or transitions or power plot.
-#' QuantData<-dataProcess(SRMRawData)
+#' QuantData <- dataProcess(SRMRawData)
#' head(QuantData$ProcessedData)
#' ## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-#' comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-#' comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-#' comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-#' comparison<-rbind(comparison1,comparison2, comparison3)
-#' row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-#' colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
-#'
-#' testResultMultiComparisons<-groupComparison(contrast.matrix=comparison, data=QuantData)
-#'
+#' comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+#' comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+#' comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+#' comparison <- rbind(comparison1, comparison2, comparison3)
+#' row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+#' colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
+#'
+#' testResultMultiComparisons <- groupComparison(contrast.matrix = comparison, data = QuantData)
+#'
#' # plot the calculated sample sizes for future experiments:
#' # (1) Minimal number of biological replicates per condition
-#' result.sample<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
-#' desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
-#' designSampleSizePlots(data=result.sample)
+#' result.sample <- designSampleSize(
+#' data = testResultMultiComparisons$FittedModel, numSample = TRUE,
+#' desiredFC = c(1.25, 1.75), FDR = 0.05, power = 0.8
+#' )
+#' designSampleSizePlots(data = result.sample)
#' # (2) Power
-#' result.power<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
-#' desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
-#' designSampleSizePlots(data=result.power)
-#'
-designSampleSizePlots = function(data, isPlotly = FALSE) {
- if (length(unique(data$numSample)) > 1) {
- index = "numSample"
- } else if (length(unique(data$power)) > 1) {
- index = "power"
- } else if (length(unique(data$numSample)) == 1 & length(unique(data$power)) == 1) {
- index = "numSample"
+#' result.power <- designSampleSize(
+#' data = testResultMultiComparisons$FittedModel, numSample = 2,
+#' desiredFC = c(1.25, 1.75), FDR = 0.05, power = TRUE
+#' )
+#' designSampleSizePlots(data = result.power)
+#'
+designSampleSizePlots <- function(data, isPlotly = FALSE) {
+ if (length(unique(data$numSample)) > 1) {
+ index <- "numSample"
+ } else if (length(unique(data$power)) > 1) {
+ index <- "power"
+ } else if (length(unique(data$numSample)) == 1 & length(unique(data$power)) == 1) {
+ index <- "numSample"
+ } else {
+ stop("Invalid input")
+ }
+
+ if (isPlotly) {
+ axis.size <- 10
+ lab.size <- 12
+ text.size <- 12
+ } else {
+ text.size <- 1.2
+ axis.size <- 1.3
+ lab.size <- 1.7
+ }
+ if (index == "numSample") {
+ if (isPlotly) {
+ p <- plot_ly(data,
+ x = ~desiredFC, y = ~numSample, type = "scatter", mode = "lines",
+ line = list(width = 2)
+ )
+ p <- layout(p,
+ xaxis = list(
+ title = "Desired fold change",
+ tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ tickfont = list(size = axis.size)
+ ),
+ yaxis = list(
+ title = "Minimal number of biological replicates",
+ tickfont = list(size = axis.size)
+ ),
+ annotations = list(
+ list(
+ x = 1, y = 1, xref = "paper", yref = "paper",
+ text = sprintf(
+ "FDR is %s
Statistical power is %s",
+ paste(unique(data$FDR), collapse = ", "),
+ paste(unique(data$power), collapse = ", ")
+ ),
+ showarrow = FALSE,
+ xanchor = "right", yanchor = "top",
+ font = list(size = text.size)
+ )
+ ),
+ margin = list(t = 50)
+ ) # Adjust top margin to avoid cutting off text
+ return(p)
} else {
- stop ("Invalid input")
+ plot(data$desiredFC, data$numSample,
+ lwd = 2, xlab = "", ylab = "",
+ cex.axis = axis.size, type = "l", xaxt = "n"
+ )
+ axis(1,
+ at = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ labels = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ cex.axis = axis.size
+ )
+ mtext("Desired fold change", 1, line = 3.5, cex = lab.size)
+ mtext("Minimal number of biological replicates", 2, line = 2.5, cex = lab.size)
+ legend("topright",
+ c(
+ paste("FDR is", unique(data$FDR)),
+ paste("Statistical power is", unique(data$power))
+ ),
+ bty = "n",
+ cex = text.size
+ )
}
-
- if(isPlotly) {
- axis.size = 10
- lab.size = 12
- text.size = 12
+ }
+
+ if (index == "power") {
+ if (isPlotly) {
+ p <- plot_ly(data,
+ x = ~desiredFC, y = ~power, type = "scatter", mode = "lines",
+ line = list(width = 2)
+ )
+ p <- layout(p,
+ xaxis = list(
+ title = "Desired fold change",
+ tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ tickfont = list(size = axis.size)
+ ),
+ yaxis = list(
+ title = "Power",
+ tickfont = list(size = axis.size)
+ ),
+ showlegend = FALSE, # Hide default legend
+ annotations = list(
+ list(
+ x = 0.5, y = 0.5, xref = "paper", yref = "paper",
+ text = paste("Number of replicates is", unique(data$numSample), "
FDR is", unique(data$FDR)),
+ showarrow = FALSE,
+ xanchor = "right", yanchor = "bottom",
+ font = list(size = text.size)
+ )
+ ),
+ margin = list(b = 100)
+ ) # Adjust bottom margin if necessary
+ return(p)
} else {
- text.size = 1.2
- axis.size = 1.3
- lab.size = 1.7
- }
- if (index == "numSample") {
- if(isPlotly) {
- p <- plot_ly(data, x = ~desiredFC, y = ~numSample, type = 'scatter', mode = 'lines',
- line = list(width = 2))
- p <- layout(p, xaxis = list(
- title = "Desired fold change",
- tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- tickfont = list(size = axis.size)
- ),
- yaxis = list(
- title = "Minimal number of biological replicates",
- tickfont = list(size = axis.size)
- ),
- annotations = list(
- list(
- x = 1, y = 1, xref = 'paper', yref = 'paper',
- text = sprintf("FDR is %s
Statistical power is %s",
- paste(unique(data$FDR), collapse = ", "),
- paste(unique(data$power), collapse = ", ")),
- showarrow = FALSE,
- xanchor = 'right', yanchor = 'top',
- font = list(size = text.size)
- )
- ),
- margin = list(t = 50)) # Adjust top margin to avoid cutting off text
- return(p)
- }
- else {
- plot(data$desiredFC, data$numSample,
- lwd=2, xlab="", ylab="",
- cex.axis=axis.size, type="l", xaxt="n")
- axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- cex.axis=axis.size)
- mtext("Desired fold change", 1, line=3.5, cex=lab.size)
- mtext("Minimal number of biological replicates", 2, line=2.5, cex=lab.size)
- legend("topright",
- c(paste("FDR is", unique(data$FDR)),
- paste("Statistical power is", unique(data$power))),
- bty="n",
- cex=text.size)
- }
- }
-
- if (index == "power") {
- if(isPlotly) {
- p <- plot_ly(data, x = ~desiredFC, y = ~power, type = 'scatter', mode = 'lines',
- line = list(width = 2))
- p <- layout(p, xaxis = list(
- title = "Desired fold change",
- tickvals = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- ticktext = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- tickfont = list(size = axis.size)
- ),
- yaxis = list(
- title = "Power",
- tickfont = list(size = axis.size)
- ),
- showlegend = FALSE, # Hide default legend
- annotations = list(
- list(
- x = 0.5, y = 0.5, xref = 'paper', yref = 'paper',
- text = paste("Number of replicates is", unique(data$numSample), "
FDR is", unique(data$FDR)),
- showarrow = FALSE,
- xanchor = 'right', yanchor = 'bottom',
- font = list(size = text.size)
- )
- ),
- margin = list(b = 100)) # Adjust bottom margin if necessary
- return(p)
- } else {
- plot(data$desiredFC, data$power,
- lwd=2, xlab="", ylab="",
- cex.axis=axis.size, type="l", xaxt="n")
- axis(1, at=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- labels=seq(min(data$desiredFC), max(data$desiredFC), 0.05),
- cex.axis=axis.size)
- mtext("Desired fold change", 1, line=3.5, cex=lab.size)
- mtext("Power", 2, line=2.5, cex=lab.size)
- legend("bottomright",
- c(paste("Number of replicates is", unique(data$numSample)),
- paste("FDR is", unique(data$FDR))),
- bty="n",
- cex=text.size)
- }
+ plot(data$desiredFC, data$power,
+ lwd = 2, xlab = "", ylab = "",
+ cex.axis = axis.size, type = "l", xaxt = "n"
+ )
+ axis(1,
+ at = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ labels = seq(min(data$desiredFC), max(data$desiredFC), 0.05),
+ cex.axis = axis.size
+ )
+ mtext("Desired fold change", 1, line = 3.5, cex = lab.size)
+ mtext("Power", 2, line = 2.5, cex = lab.size)
+ legend("bottomright",
+ c(
+ paste("Number of replicates is", unique(data$numSample)),
+ paste("FDR is", unique(data$FDR))
+ ),
+ bty = "n",
+ cex = text.size
+ )
}
+ }
}
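# A minimal usage sketch of the isPlotly argument, assuming the objects
# result.sample and result.power from the @examples above and an installed
# plotly package; the static base-R plots remain the default behaviour.
if (requireNamespace("plotly", quietly = TRUE)) {
  # returns interactive widgets instead of drawing base-R plots
  p_sample <- designSampleSizePlots(data = result.sample, isPlotly = TRUE)
  p_power <- designSampleSizePlots(data = result.power, isPlotly = TRUE)
  p_sample
}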
diff --git a/R/groupComparison.R b/R/groupComparison.R
index 0dbc0a96..df45ce0d 100644
--- a/R/groupComparison.R
+++ b/R/groupComparison.R
@@ -5,8 +5,8 @@
#' @param save_fitted_models logical, if TRUE, fitted models will be added to
#' the output.
#' @param log_base base of the logarithm used in dataProcess.
-#' @param numberOfCores Number of cores for parallel processing. When > 1,
-#' a logfile named `MSstats_groupComparison_log_progress.log` is created to
+#' @param numberOfCores Number of cores for parallel processing. When > 1,
+#' a logfile named `MSstats_groupComparison_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS. Default is 1.
#' @inheritParams .documentFunction
#'
@@ -56,199 +56,220 @@
#' }
#' \item{FittedModel}{A list of fitted models for each protein. This is included only if `save_fitted_models` is set to TRUE. Each element of the list corresponds to a protein and contains the fitted model object.}
#' }
-#'
-#' @export
+#'
+#' @export
#' @import lme4
#' @import limma
#' @importFrom data.table rbindlist
#'
#' @examples
-#' # Consider quantitative data (i.e. QuantData) from yeast study with ten time points of interests,
-#' # three biological replicates, and no technical replicates.
+#' # Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interest,
+#' # three biological replicates, and no technical replicates.
#' # It is a time-course experiment and we attempt to compare differential abundance
-#' # between time 1 and 7 in a set of targeted proteins.
-#' # In this label-based SRM experiment, MSstats uses the fitted model with expanded scope of
-#' # Biological replication.
+#' # between time 1 and 7 in a set of targeted proteins.
+#' # In this label-based SRM experiment, MSstats uses the fitted model with expanded scope of
+#' # biological replication.
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' levels(QuantData$ProteinLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
-#' groups = levels(QuantData$ProteinLevelData$GROUP)
+#' groups <- levels(QuantData$ProteinLevelData$GROUP)
#' colnames(comparison) <- groups[order(as.numeric(groups))]
#' # Tests for differentially abundant proteins with models:
#' # label-based SRM experiment with expanded scope of biological replication.
-#' testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
-#' use_log_file = FALSE)
+#' testResultOneComparison <- groupComparison(
+#' contrast.matrix = comparison, data = QuantData,
+#' use_log_file = FALSE
+#' )
#' # table for result
#' testResultOneComparison$ComparisonResult
#'
-groupComparison = function(contrast.matrix, data,
- save_fitted_models = TRUE, log_base = 2,
- use_log_file = TRUE, append = FALSE,
- verbose = TRUE, log_file_path = NULL,
- numberOfCores = 1
-) {
- MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
- log_file_path,
- "MSstats_groupComparison_log_")
- getOption("MSstatsLog")("INFO", "MSstats - groupComparison function")
- labeled = data.table::uniqueN(data$FeatureLevelData$Label) > 1
- split_summarized = MSstatsPrepareForGroupComparison(data)
- repeated = checkRepeatedDesign(data)
- samples_info = getSamplesInfo(data)
- groups = unique(data$ProteinLevelData$GROUP)
- contrast_matrix = MSstatsContrastMatrix(contrast.matrix, groups)
- getOption("MSstatsLog")("INFO",
- "== Start to test and get inference in whole plot")
- getOption("MSstatsMsg")("INFO",
- " == Start to test and get inference in whole plot ...")
- testing_results = MSstatsGroupComparison(split_summarized, contrast_matrix,
- save_fitted_models, repeated, samples_info,
- numberOfCores)
- getOption("MSstatsLog")("INFO",
- "== Comparisons for all proteins are done.")
- getOption("MSstatsMsg")("INFO",
- " == Comparisons for all proteins are done.")
- MSstatsGroupComparisonOutput(testing_results, data, log_base)
+groupComparison <- function(contrast.matrix, data,
+ save_fitted_models = TRUE, log_base = 2,
+ use_log_file = TRUE, append = FALSE,
+ verbose = TRUE, log_file_path = NULL,
+ numberOfCores = 1) {
+ MSstatsConvert::MSstatsLogsSettings(
+ use_log_file, append, verbose,
+ log_file_path,
+ "MSstats_groupComparison_log_"
+ )
+ getOption("MSstatsLog")("INFO", "MSstats - groupComparison function")
+ labeled <- data.table::uniqueN(data$FeatureLevelData$Label) > 1
+ split_summarized <- MSstatsPrepareForGroupComparison(data)
+ repeated <- checkRepeatedDesign(data)
+ samples_info <- getSamplesInfo(data)
+ groups <- unique(data$ProteinLevelData$GROUP)
+ contrast_matrix <- MSstatsContrastMatrix(contrast.matrix, groups)
+ getOption("MSstatsLog")("INFO",
+ "== Start to test and get inference in whole plot")
+ getOption("MSstatsMsg")("INFO",
+ " == Start to test and get inference in whole plot ...")
+ testing_results <- MSstatsGroupComparison(
+ split_summarized, contrast_matrix,
+ save_fitted_models, repeated, samples_info,
+ numberOfCores
+ )
+ getOption("MSstatsLog")("INFO",
+ "== Comparisons for all proteins are done.")
+ getOption("MSstatsMsg")("INFO",
+ " == Comparisons for all proteins are done.")
+ MSstatsGroupComparisonOutput(testing_results, data, log_base)
}
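# A sketch of the parallel path, assuming the comparison matrix and QuantData
# from the @examples above and a Linux or Mac OS machine; with
# numberOfCores > 1, a progress logfile
# "MSstats_groupComparison_log_progress.log" is created, as documented above.
testResultParallel <- groupComparison(
  contrast.matrix = comparison, data = QuantData,
  numberOfCores = 2, use_log_file = FALSE
)
head(testResultParallel$ComparisonResult)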
#' Prepare output for dataProcess for group comparison
-#'
+#'
#' @param summarization_output output of dataProcess
-#'
-#' @return list of run-level data for each protein in the input.
+#'
+#' @return list of run-level data for each protein in the input.
#' This list has a "has_imputed" attribute that indicates if missing values
#' were imputed in the input dataset.
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-#' group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+#' group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
#' length(group_comparison_input) # list of length equal to number of proteins
#' # in protein-level data of QuantData
#' head(group_comparison_input[[1]])
-MSstatsPrepareForGroupComparison = function(summarization_output) {
- has_imputed = is.element("NumImputedFeature", colnames(summarization_output$ProteinLevelData))
- summarized = data.table::as.data.table(summarization_output$ProteinLevelData)
- summarized = .checkGroupComparisonInput(summarized)
- labeled = nlevels(summarization_output$FeatureLevelData$LABEL) > 1
-
- getOption("MSstatsLog")("INFO", paste0("labeled = ", labeled))
- getOption("MSstatsLog")("INFO", "scopeOfBioReplication = expanded")
- output = split(summarized, summarized$Protein)
- attr(output, "has_imputed") = has_imputed
- output
+MSstatsPrepareForGroupComparison <- function(summarization_output) {
+ has_imputed <- is.element("NumImputedFeature", colnames(summarization_output$ProteinLevelData))
+ summarized <- data.table::as.data.table(summarization_output$ProteinLevelData)
+ summarized <- .checkGroupComparisonInput(summarized)
+ labeled <- nlevels(summarization_output$FeatureLevelData$LABEL) > 1
+
+ getOption("MSstatsLog")("INFO", paste0("labeled = ", labeled))
+ getOption("MSstatsLog")("INFO", "scopeOfBioReplication = expanded")
+ output <- split(summarized, summarized$Protein)
+ attr(output, "has_imputed") <- has_imputed
+ output
}
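# A quick look at the attribute described in the @return section above,
# assuming group_comparison_input from the @examples.
attr(group_comparison_input, "has_imputed") # TRUE if missing values were imputed
head(names(group_comparison_input))         # one list element per protein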
#' Group comparison
-#'
+#'
#' @param summarized_list output of MSstatsPrepareForGroupComparison
#' @param contrast_matrix contrast matrix
#' @param save_fitted_models if TRUE, fitted models will be included in the output
#' @param repeated logical, output of checkRepeatedDesign function
#' @param samples_info data.table, output of getSamplesInfo function
-#' @param numberOfCores Number of cores for parallel processing. When > 1,
-#' a logfile named `MSstats_groupComparison_log_progress.log` is created to
+#' @param numberOfCores Number of cores for parallel processing. When > 1,
+#' a logfile named `MSstats_groupComparison_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS.
#'
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-#' group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+#' group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
#' levels(QuantData$ProteinLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
-#' groups = levels(QuantData$ProteinLevelData$GROUP)
+#' groups <- levels(QuantData$ProteinLevelData$GROUP)
#' colnames(comparison) <- groups[order(as.numeric(groups))]
-#' samples_info = getSamplesInfo(QuantData)
-#' repeated = checkRepeatedDesign(QuantData)
-#' group_comparison = MSstatsGroupComparison(group_comparison_input, comparison,
-#' FALSE, repeated, samples_info)
+#' samples_info <- getSamplesInfo(QuantData)
+#' repeated <- checkRepeatedDesign(QuantData)
+#' group_comparison <- MSstatsGroupComparison(
+#' group_comparison_input, comparison,
+#' FALSE, repeated, samples_info
+#' )
#' length(group_comparison) # list of length equal to number of proteins
#' group_comparison[[1]][[1]] # data used to fit linear model
#' group_comparison[[1]][[2]] # comparison result
#' group_comparison[[2]][[3]] # NULL, because we set save_fitted_models to FALSE
-#'
-MSstatsGroupComparison = function(summarized_list, contrast_matrix,
- save_fitted_models, repeated, samples_info,
- numberOfCores = 1) {
- if (numberOfCores > 1) {
- return(.groupComparisonWithMultipleCores(summarized_list, contrast_matrix,
- save_fitted_models, repeated,
- samples_info, numberOfCores))
- } else {
- return(.groupComparisonWithSingleCore(summarized_list, contrast_matrix,
- save_fitted_models, repeated,
- samples_info))
- }
+#'
+MSstatsGroupComparison <- function(summarized_list, contrast_matrix,
+ save_fitted_models, repeated, samples_info,
+ numberOfCores = 1) {
+ if (numberOfCores > 1) {
+ return(.groupComparisonWithMultipleCores(
+ summarized_list, contrast_matrix,
+ save_fitted_models, repeated,
+ samples_info, numberOfCores
+ ))
+ } else {
+ return(.groupComparisonWithSingleCore(
+ summarized_list, contrast_matrix,
+ save_fitted_models, repeated,
+ samples_info
+ ))
+ }
}
#' Create output of group comparison based on results for individual proteins
-#'
+#'
#' @param input output of MSstatsGroupComparison function
#' @param summarization_output output of dataProcess function
#' @param log_base base of the logarithm used in fold-change calculation
-#'
+#'
#' @importFrom stats p.adjust
-#'
+#'
#' @export
-#'
+#'
#' @return list, same as the output of `groupComparison`
-#'
-#' @examples
+#'
+#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-#' group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+#' group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
#' levels(QuantData$ProteinLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
-#' groups = levels(QuantData$ProteinLevelData$GROUP)
+#' groups <- levels(QuantData$ProteinLevelData$GROUP)
#' colnames(comparison) <- groups[order(as.numeric(groups))]
-#' samples_info = getSamplesInfo(QuantData)
-#' repeated = checkRepeatedDesign(QuantData)
-#' group_comparison = MSstatsGroupComparison(group_comparison_input, comparison,
-#' FALSE, repeated, samples_info)
-#' group_comparison_final = MSstatsGroupComparisonOutput(group_comparison,
-#' QuantData)
-#' group_comparison_final[["ComparisonResult"]]
-#'
-MSstatsGroupComparisonOutput = function(input, summarization_output, log_base = 2) {
- adj.pvalue = pvalue = issue = NULL
-
- has_imputed = is.element("NumImputedFeature", colnames(summarization_output$ProteinLevelData))
- model_qc_data = lapply(input, function(x) x[[1]])
- comparisons = lapply(input, function(x) x[[2]])
- fitted_models = lapply(input, function(x) x[[3]])
- comparisons = data.table::rbindlist(comparisons, fill = TRUE)
- comparisons[, adj.pvalue := p.adjust(pvalue, method = "BH"),
- by = "Label"]
- logFC_colname = paste0("log", log_base, "FC")
- comparisons[, adj.pvalue := ifelse(!is.na(issue) &
- issue == "oneConditionMissing",
- 0, adj.pvalue)]
- data.table::setnames(comparisons, "logFC", logFC_colname)
- qc = rbindlist(model_qc_data, fill = TRUE)
- cols = c("Protein", "Label", logFC_colname, "SE", "Tvalue", "DF",
- "pvalue", "adj.pvalue", "issue", "MissingPercentage",
- "ImputationPercentage")
- if (!has_imputed) {
- cols = cols[1:10]
- }
- getOption("MSstatsLog")("INFO", "The output for groupComparison is ready.")
- list(ComparisonResult = as.data.frame(comparisons)[, cols],
- ModelQC = as.data.frame(qc),
- FittedModel = fitted_models)
+#' samples_info <- getSamplesInfo(QuantData)
+#' repeated <- checkRepeatedDesign(QuantData)
+#' group_comparison <- MSstatsGroupComparison(
+#' group_comparison_input, comparison,
+#' FALSE, repeated, samples_info
+#' )
+#' group_comparison_final <- MSstatsGroupComparisonOutput(
+#' group_comparison,
+#' QuantData
+#' )
+#' group_comparison_final[["ComparisonResult"]]
+#'
+MSstatsGroupComparisonOutput <- function(input, summarization_output, log_base = 2) {
+ adj.pvalue <- pvalue <- issue <- NULL
+
+ has_imputed <- is.element("NumImputedFeature", colnames(summarization_output$ProteinLevelData))
+ model_qc_data <- lapply(input, function(x) x[[1]])
+ comparisons <- lapply(input, function(x) x[[2]])
+ fitted_models <- lapply(input, function(x) x[[3]])
+ comparisons <- data.table::rbindlist(comparisons, fill = TRUE)
+ comparisons[, adj.pvalue := p.adjust(pvalue, method = "BH"),
+ by = "Label"
+ ]
+ logFC_colname <- paste0("log", log_base, "FC")
+ comparisons[, adj.pvalue := ifelse(!is.na(issue) &
+ issue == "oneConditionMissing",
+ 0, adj.pvalue
+ )]
+ data.table::setnames(comparisons, "logFC", logFC_colname)
+ qc <- rbindlist(model_qc_data, fill = TRUE)
+ cols <- c(
+ "Protein", "Label", logFC_colname, "SE", "Tvalue", "DF",
+ "pvalue", "adj.pvalue", "issue", "MissingPercentage",
+ "ImputationPercentage"
+ )
+ if (!has_imputed) {
+ cols <- cols[1:10]
+ }
+ getOption("MSstatsLog")("INFO", "The output for groupComparison is ready.")
+ list(
+ ComparisonResult = as.data.frame(comparisons)[, cols],
+ ModelQC = as.data.frame(qc),
+ FittedModel = fitted_models
+ )
}
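# A small illustration of the per-comparison adjustment performed above:
# p-values are BH-corrected separately within each contrast Label
# (toy numbers, not real MSstats output).
library(data.table)
toy <- data.table(
  Label = rep(c("T7-T1", "T9-T1"), each = 3),
  pvalue = c(0.01, 0.04, 0.20, 0.03, 0.50, 0.90)
)
toy[, adj.pvalue := p.adjust(pvalue, method = "BH"), by = "Label"]
toy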
#' Group comparison for a single protein
-#'
+#'
#' @param single_protein data.table with summarized data for a single protein
#' @param contrast_matrix contrast matrix
#' @param repeated if TRUE, repeated measurements will be modeled
@@ -257,44 +278,51 @@ MSstatsGroupComparisonOutput = function(input, summarization_output, log_base =
#' @param save_fitted_models if TRUE, fitted model will be saved.
#' If not, it will be replaced with NULL
#' @param has_imputed TRUE if missing values have been imputed
-#'
+#'
#' @export
-#'
-#' @examples
+#'
+#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
#' levels(QuantData$ProteinLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
-#' groups = levels(QuantData$ProteinLevelData$GROUP)
+#' groups <- levels(QuantData$ProteinLevelData$GROUP)
#' colnames(comparison) <- groups[order(as.numeric(groups))]
#' samples_info <- getSamplesInfo(QuantData)
#' repeated <- checkRepeatedDesign(QuantData)
#' single_output <- MSstatsGroupComparisonSingleProtein(
#' group_comparison_input[[1]], comparison, repeated, groups, samples_info,
-#' FALSE, TRUE)
+#' FALSE, TRUE
+#' )
#' single_output # same as a single element of MSstatsGroupComparison output
-#'
-MSstatsGroupComparisonSingleProtein = function(single_protein, contrast_matrix,
- repeated, groups, samples_info,
- save_fitted_models,
- has_imputed) {
- single_protein = .prepareSingleProteinForGC(single_protein)
- is_single_subject = .checkSingleSubject(single_protein)
- has_tech_reps = .checkTechReplicate(single_protein)
-
- fitted_model = try(.fitModelSingleProtein(single_protein, contrast_matrix,
- has_tech_reps, is_single_subject,
- repeated, groups, samples_info,
- save_fitted_models, has_imputed),
- silent = TRUE)
- if (inherits(fitted_model, "try-error")) {
- result = list(list(Protein = unique(single_protein$Protein),
- Label = row.names(contrast_matrix),
- logFC = NA, SE = NA, Tvalue = NA,
- DF = NA, pvalue = NA, issue = NA), NULL)
- } else {
- result = fitted_model
- }
- list(single_protein, result[[1]], result[[2]])
+#'
+MSstatsGroupComparisonSingleProtein <- function(single_protein, contrast_matrix,
+ repeated, groups, samples_info,
+ save_fitted_models,
+ has_imputed) {
+ single_protein <- .prepareSingleProteinForGC(single_protein)
+ is_single_subject <- .checkSingleSubject(single_protein)
+ has_tech_reps <- .checkTechReplicate(single_protein)
+
+ fitted_model <- try(
+ .fitModelSingleProtein(
+ single_protein, contrast_matrix,
+ has_tech_reps, is_single_subject,
+ repeated, groups, samples_info,
+ save_fitted_models, has_imputed
+ ),
+ silent = TRUE
+ )
+ if (inherits(fitted_model, "try-error")) {
+ result <- list(list(
+ Protein = unique(single_protein$Protein),
+ Label = row.names(contrast_matrix),
+ logFC = NA, SE = NA, Tvalue = NA,
+ DF = NA, pvalue = NA, issue = NA
+ ), NULL)
+ } else {
+ result <- fitted_model
+ }
+ list(single_protein, result[[1]], result[[2]])
}
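# A sketch of applying the single-protein routine over all proteins, which is
# essentially what the single-core path does; assumes group_comparison_input,
# comparison, repeated, groups and samples_info from the @examples above.
per_protein <- lapply(
  group_comparison_input, MSstatsGroupComparisonSingleProtein,
  contrast_matrix = comparison, repeated = repeated, groups = groups,
  samples_info = samples_info, save_fitted_models = FALSE, has_imputed = TRUE
)
length(per_protein) # one result per protein, as in MSstatsGroupComparison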
diff --git a/R/groupComparisonPlots.R b/R/groupComparisonPlots.R
index 9ce71bcf..89f5f672 100644
--- a/R/groupComparisonPlots.R
+++ b/R/groupComparisonPlots.R
@@ -1,12 +1,12 @@
#' Visualization for model-based analysis and summarizing differentially abundant proteins
-#'
-#' @description To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins,
-#' groupComparisonPlots takes testing results from function (\code{\link{groupComparison}}) as input and
-#' automatically generate three types of figures in pdf files as output :
-#' (1) volcano plot (specify "VolcanoPlot" in option type) for each comparison separately;
-#' (2) heatmap (specify "Heatmap" in option type) for multiple comparisons ;
+#'
+#' @description To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins,
+#' groupComparisonPlots takes testing results from function (\code{\link{groupComparison}}) as input and
+#' automatically generates three types of figures in pdf files as output:
+#' (1) volcano plot (specify "VolcanoPlot" in option type) for each comparison separately;
+#' (2) heatmap (specify "Heatmap" in option type) for multiple comparisons;
#' (3) comparison plot (specify "ComparisonPlot" in option type) for multiple comparisons per protein.
-#'
+#'
#' @param data 'ComparisonResult' in testing output from function groupComparison.
#' @param type choice of visualization. "VolcanoPlot" represents volcano plot of log fold changes and adjusted p-values for each comparison separately. "Heatmap" represents heatmap of adjusted p-values for multiple comparisons. "ComparisonPlot" represents comparison plot of log fold changes for multiple comparisons per protein.
#' @param sig FDR cutoff for the adjusted p-values in heatmap and volcano plot. level of significance for comparison plot. 100(1-sig)\% confidence interval will be drawn. sig=0.05 is default.
@@ -24,150 +24,169 @@
#' @param ProteinName for volcano plot only, whether display protein names or not. TRUE (default) means protein names, which are significant, are displayed next to the points. FALSE means no protein names are displayed.
#' @param colorkey TRUE(default) shows colorkey.
#' @param numProtein For ggplot2: The number of proteins which will be presented in each heatmap. Default is 100. Maximum possible number of protein for one heatmap is 180.
-#' For Plotly: use this parameter to adjust the number of proteins to be displayed on the heatmap
+#' For Plotly: use this parameter to adjust the number of proteins to be displayed on the heatmap
#' @param clustering Determines how to order proteins and comparisons. Hierarchical cluster analysis with Ward method(minimum variance) is performed. 'protein' means that protein dendrogram is computed and reordered based on protein means (the order of row is changed). 'comparison' means comparison dendrogram is computed and reordered based on comparison means (the order of comparison is changed). 'both' means to reorder both protein and comparison. Default is 'protein'.
#' @param width width of the saved file in pixels. Default is 800.
#' @param height height of the saved file in pixels. Default is 600.
#' @param which.Comparison list of comparisons to draw plots. List can be labels of comparisons or order numbers of comparisons from levels(data$Label), such as levels(testResultMultiComparisons$ComparisonResult$Label). Default is "all", which generates all plots for each protein.
#' @param which.Protein Protein list to draw comparison plots. List can be names of Proteins or order numbers of Proteins from levels(testResultMultiComparisons$ComparisonResult$Protein). Default is "all", which generates all comparison plots for each protein.
#' @param address the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of "VolcanoPlot.pdf" or "Heatmap.pdf" or "ComparisonPlot.pdf". The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If address=FALSE, plot will be not saved as pdf file but showed in window.
-#' @param isPlotly This parameter is for MSstatsShiny application for plotly
-#' render, this cannot be used for saving PDF files as plotly do not have
+#' @param isPlotly This parameter is for the MSstatsShiny application for plotly
+#' rendering; it cannot be used for saving PDF files, as plotly does not have
#' support for PDFs currently. address and isPlotly cannot be set as TRUE at the
#' same time.
-#'
-#'
-#' @details
+#'
+#'
+#' @details
#' \itemize{
#' \item{Volcano plot : illustrate actual log-fold changes and adjusted p-values for each comparison separately with all proteins. The x-axis is the log fold change. The base of logarithm transformation is the same as specified in "logTrans" from \code{\link{dataProcess}}. The y-axis is the negative log2 or log10 adjusted p-values. The horizontal dashed line represents the FDR cutoff. The points below the FDR cutoff line are non-significantly abundant proteins (colored in black). The points above the FDR cutoff line are significantly abundant proteins (colored in red/blue for up-/down-regulated). If fold change cutoff is specified (FCcutoff = specific value), the points above the FDR cutoff line but within the FC cutoff line are non-significantly abundant proteins (colored in black)/}
#' \item{Heatmap : illustrate up-/down-regulated proteins for multiple comparisons with all proteins. Each column represents each comparison of interest. Each row represents each protein. Color red/blue represents proteins in that specific comparison are significantly up-regulated/down-regulated proteins with FDR cutoff and/or FC cutoff. The color scheme shows the evidences of significance. The darker color it is, the stronger evidence of significance it has. Color gold represents proteins are not significantly different in abundance.}
#' \item{Comparison plot : illustrate log-fold change and its variation of multiple comparisons for single protein. X-axis is comparison of interest. Y-axis is the log fold change. The red points are the estimated log fold change from the model. The blue error bars are the confidence interval with 0.95 significant level for log fold change. This interval is only based on the standard error, which is estimated from the model. }
#' }
-#'
+#'
#' @importFrom gplots heatmap.2
#' @importFrom stats hclust
#' @importFrom ggrepel geom_text_repel
#' @importFrom marray maPalette
#' @importFrom plotly ggplotly style add_trace plot_ly subplot
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' ## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-#' comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-#' comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-#' comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-#' comparison<-rbind(comparison1,comparison2, comparison3)
-#' row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-#' groups = levels(QuantData$ProteinLevelData$GROUP)
+#' comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+#' comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+#' comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+#' comparison <- rbind(comparison1, comparison2, comparison3)
+#' row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+#' groups <- levels(QuantData$ProteinLevelData$GROUP)
#' colnames(comparison) <- groups[order(as.numeric(groups))]
-#' testResultMultiComparisons<-groupComparison(contrast.matrix=comparison,
-#' data=QuantData,
-#' use_log_file = FALSE)
+#' testResultMultiComparisons <- groupComparison(
+#' contrast.matrix = comparison,
+#' data = QuantData,
+#' use_log_file = FALSE
+#' )
#' testResultMultiComparisons$ComparisonResult
#' # Volcano plot with FDR cutoff = 0.05 and no FC cutoff
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot",
-#' logBase.pvalue=2, address="Ex1_")
-#' # Volcano plot with FDR cutoff = 0.05, FC cutoff = 70, upper y-axis limit = 100,
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
+#' logBase.pvalue = 2, address = "Ex1_"
+#' )
+#' # Volcano plot with FDR cutoff = 0.05, FC cutoff = 70, upper y-axis limit = 100,
#' # and no protein name displayed
#' # FCcutoff=70 is for demonstration purpose
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot",
-#' FCcutoff=70, logBase.pvalue=2, ylimUp=100, ProteinName=FALSE,address="Ex2_")
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
+#' FCcutoff = 70, logBase.pvalue = 2, ylimUp = 100, ProteinName = FALSE, address = "Ex2_"
+#' )
#' # Heatmap with FDR cutoff = 0.05
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap",
-#' logBase.pvalue=2, address="Ex1_")
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "Heatmap",
+#' logBase.pvalue = 2, address = "Ex1_"
+#' )
#' # Heatmap with FDR cutoff = 0.05 and FC cutoff = 70
#' # FCcutoff=70 is for demonstration purpose
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap",
-#' FCcutoff=70, logBase.pvalue=2, address="Ex2_")
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "Heatmap",
+#' FCcutoff = 70, logBase.pvalue = 2, address = "Ex2_"
+#' )
#' # Comparison Plot
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot",
-#' address="Ex1_")
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "ComparisonPlot",
+#' address = "Ex1_"
+#' )
#' # Comparison Plot
-#' groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot",
-#' ylimUp=8, ylimDown=-1, address="Ex2_")
-#'
-groupComparisonPlots = function(
+#' groupComparisonPlots(
+#' data = testResultMultiComparisons$ComparisonResult, type = "ComparisonPlot",
+#' ylimUp = 8, ylimDown = -1, address = "Ex2_"
+#' )
+#'
+groupComparisonPlots <- function(
data, type, sig = 0.05, FCcutoff = FALSE, logBase.pvalue = 10, ylimUp = FALSE,
- ylimDown = FALSE, xlimUp = FALSE, x.axis.size = 10, y.axis.size = 10,
- dot.size = 3, text.size = 4, text.angle = 0, legend.size = 13,
- ProteinName = TRUE, colorkey = TRUE, numProtein = 100, clustering = "both",
+ ylimDown = FALSE, xlimUp = FALSE, x.axis.size = 10, y.axis.size = 10,
+ dot.size = 3, text.size = 4, text.angle = 0, legend.size = 13,
+ ProteinName = TRUE, colorkey = TRUE, numProtein = 100, clustering = "both",
width = 800, height = 600, which.Comparison = "all", which.Protein = "all",
- address = "", isPlotly=FALSE
-) {
- Label = Protein = NULL
-
- type = toupper(type)
- input = data.table::as.data.table(data)
- all_labels = as.character(unique(data$Label))
- log_base_FC = ifelse(is.element("log2FC", colnames(data)), 2, 10)
-
- chosen_labels = .checkGCPlotsInput(type, logBase.pvalue, which.Comparison,
- all_labels)
- input = input[Label %in% chosen_labels]
- input[, Protein := factor(Protein)]
- input[, Label := factor(Label)]
-
- if(type == "HEATMAP" || type == "COMPARISONPLOT") {
- warning("Avoid plotting all proteins as it can take a large amount of time
+ address = "", isPlotly = FALSE) {
+ Label <- Protein <- NULL
+
+ type <- toupper(type)
+ input <- data.table::as.data.table(data)
+ all_labels <- as.character(unique(data$Label))
+ log_base_FC <- ifelse(is.element("log2FC", colnames(data)), 2, 10)
+
+ chosen_labels <- .checkGCPlotsInput(
+ type, logBase.pvalue, which.Comparison,
+ all_labels
+ )
+ input <- input[Label %in% chosen_labels]
+ input[, Protein := factor(Protein)]
+ input[, Label := factor(Label)]
+
+ if (type == "HEATMAP" || type == "COMPARISONPLOT") {
+ warning("Avoid plotting all proteins as it can take a large amount of time
to download the files")
- }
-
- if(isPlotly & address != FALSE) {
- print("Plots will be saved as .HTML file as plotly is selected, set isPlotly = FALSE, if
+ }
+
+ if (isPlotly & address != FALSE) {
+ print("Plots will be saved as .HTML file as plotly is selected, set isPlotly = FALSE, if
you want to generate PDF using ggplot2")
+ }
+
+ if (type == "HEATMAP") {
+ plotly_plot <- .plotHeatmap(
+ input, logBase.pvalue, ylimUp, FCcutoff, sig, clustering,
+ numProtein, colorkey, width, height, log_base_FC,
+ x.axis.size, y.axis.size, address, isPlotly
+ )
+ if (isPlotly) {
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(list(plotly_plot), address, "Heatmap", width, height)
+ }
+ plotly_plot
}
-
- if (type == "HEATMAP") {
- plotly_plot <- .plotHeatmap(input, logBase.pvalue, ylimUp, FCcutoff, sig, clustering,
- numProtein, colorkey, width, height, log_base_FC,
- x.axis.size, y.axis.size, address, isPlotly)
- if(isPlotly) {
- if(address != FALSE) {
- .savePlotlyPlotHTML(list(plotly_plot),address,"Heatmap" ,width, height)
- }
- plotly_plot
- }
- }
- else if (type == "VOLCANOPLOT") {
- plots <- .plotVolcano(input, which.Comparison, address, width, height, logBase.pvalue,
- ylimUp, ylimDown, FCcutoff, sig, xlimUp, ProteinName, dot.size,
- text.size, legend.size, x.axis.size, y.axis.size, log_base_FC, isPlotly)
- plotly_plots <- vector("list", length(plots))
- if(isPlotly) {
- for(i in seq_along(plots)) {
- plot <- plots[[i]]
- plotly_plot <- .convertGgplot2Plotly(plot,tips=c("Protein","logFC","log10adjp","log2adjp"))
- plotly_plot <- .fixLegendPlotlyPlotsVolcano(plotly_plot)
- plotly_plots[[i]] = list(plotly_plot)
- }
- if(address != FALSE) {
- .savePlotlyPlotHTML(plotly_plots,address,"VolcanoPlot" ,width, height)
- }
- plotly_plots <- unlist(plotly_plots, recursive = FALSE)
- plotly_plots
- }
+ } else if (type == "VOLCANOPLOT") {
+ plots <- .plotVolcano(
+ input, which.Comparison, address, width, height, logBase.pvalue,
+ ylimUp, ylimDown, FCcutoff, sig, xlimUp, ProteinName, dot.size,
+ text.size, legend.size, x.axis.size, y.axis.size, log_base_FC, isPlotly
+ )
+ plotly_plots <- vector("list", length(plots))
+ if (isPlotly) {
+ for (i in seq_along(plots)) {
+ plot <- plots[[i]]
+ plotly_plot <- .convertGgplot2Plotly(plot, tips = c("Protein", "logFC", "log10adjp", "log2adjp"))
+ plotly_plot <- .fixLegendPlotlyPlotsVolcano(plotly_plot)
+ plotly_plots[[i]] <- list(plotly_plot)
+ }
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(plotly_plots, address, "VolcanoPlot", width, height)
+ }
+ plotly_plots <- unlist(plotly_plots, recursive = FALSE)
+ plotly_plots
}
- else if (type == "COMPARISONPLOT") {
- plots <- .plotComparison(input, which.Protein, address, width, height, sig, ylimUp,
- ylimDown, text.angle, dot.size, x.axis.size, y.axis.size,
- log_base_FC, isPlotly)
- plotly_plots <- vector("list", length(plots))
- if(isPlotly) {
- for(i in seq_along(plots)) {
- plot <- plots[[i]]
- plotly_plot <- .convertGgplot2Plotly(plot,tips=c("logFC"))
- plotly_plots[[i]] = list(plotly_plot)
- }
- if(address != FALSE) {
- .savePlotlyPlotHTML(plotly_plots,address,"ComparisonPlot" ,width, height)
- }
- plotly_plots <- unlist(plotly_plots, recursive = FALSE)
- plotly_plots
- }
+ } else if (type == "COMPARISONPLOT") {
+ plots <- .plotComparison(
+ input, which.Protein, address, width, height, sig, ylimUp,
+ ylimDown, text.angle, dot.size, x.axis.size, y.axis.size,
+ log_base_FC, isPlotly
+ )
+ plotly_plots <- vector("list", length(plots))
+ if (isPlotly) {
+ for (i in seq_along(plots)) {
+ plot <- plots[[i]]
+ plotly_plot <- .convertGgplot2Plotly(plot, tips = c("logFC"))
+ plotly_plots[[i]] <- list(plotly_plot)
+ }
+ if (address != FALSE) {
+ .savePlotlyPlotHTML(plotly_plots, address, "ComparisonPlot", width, height)
+ }
+ plotly_plots <- unlist(plotly_plots, recursive = FALSE)
+ plotly_plots
}
+ }
}
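# A usage sketch of the plotly path, assuming testResultMultiComparisons from
# the @examples above and an installed plotly package; with isPlotly = TRUE
# and address = FALSE a list of interactive widgets is returned instead of a
# pdf file being written.
if (requireNamespace("plotly", quietly = TRUE)) {
  volcano_widgets <- groupComparisonPlots(
    data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
    which.Comparison = "T7-T1", logBase.pvalue = 2,
    address = FALSE, isPlotly = TRUE
  )
  volcano_widgets[[1]]
}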
@@ -177,121 +196,120 @@ groupComparisonPlots = function(
#' @param log_base_pval log base for p-values
#' @param log_base_FC log base for log-fold changes - 2 or 10
#' @keywords internal
-.plotHeatmap = function(
- input, log_base_pval, ylimUp, FCcutoff, sig, clustering, numProtein, colorkey,
- width, height, log_base_FC, x.axis.size, y.axis.size, address, isPlotly
-) {
- adj.pvalue = heat_val = NULL
-
- if (length(unique(input$Protein)) <= 1) {
- stop("At least two proteins are needed for heatmaps.")
- }
- if (length(unique(input$Label)) <= 1) {
- stop("At least two comparisons are needed for heatmaps.")
- }
-
- if (is.numeric(ylimUp)) {
- y.limUp = ylimUp
- } else {
- y.limUp = ifelse(log_base_pval == 2, 30, 10)
- input[adj.pvalue < log_base_pval ^ (-y.limUp), adj.pvalue := log_base_pval ^ (-y.limUp)]
- }
-
- if (is.numeric(FCcutoff)) {
- input$adj.pvalue = ifelse(input[, 3] < log(FCcutoff, log_base_FC) & input[, 3] > -log(FCcutoff, log_base_FC),
- 1, input$adj.pvalue)
- }
-
- input[, heat_val := -log(adj.pvalue, log_base_pval) * sign(input[, 3])]
- wide = data.table::dcast(input, Protein ~ Label,
- value.var = "heat_val")
- proteins = wide$Protein
- wide = as.matrix(wide[, -1])
- rownames(wide) = proteins
- wide = wide[rowSums(!is.na(wide)) != 0, colSums(!is.na(wide)) != 0]
- wide = .getOrderedMatrix(wide, clustering)
- up = 10
- temp = 10 ^ (-sort(ceiling(seq(2, up, length = 10)[c(1, 2, 3, 5, 10)]), decreasing = TRUE))
- breaks = c(temp, sig)
- neg.breaks = log(breaks, log_base_pval)
- my.breaks = c(neg.breaks, 0, -neg.breaks[6:1], 101)
- blocks = c(-breaks, 1, breaks[6:1])
- namepro = rownames(wide)
- totalpro = length(namepro)
- numheatmap = totalpro %/% numProtein + 1
-
- my.colors = maPalette(low = "blue", high = "red", mid = "black", k = 12)
- my.colors = c(my.colors,"grey")
- blocks = c(blocks,"NA")
- my.colors.rgb = lapply(my.colors, function(x) as.vector(col2rgb(x)))
- color.key.plotly = .getColorKeyPlotly(my.colors.rgb, blocks)
-
- if(!isPlotly) {
- if(colorkey) {
- .getColorKeyGGPlot2(my.colors, blocks)
- }
- savePlot(address, "Heatmap", width, height)
- }
- blue.red.18 = maPalette(low = "blue", high = "red", mid = "black", k = 14)
- if(isPlotly) {
- # If plotly, plot all proteins on a single heatmap
- heatmap = .makeHeatmapPlotly(wide, blue.red.18, my.breaks, x.axis.size, y.axis.size, height, numProtein)
- } else {
- for (j in seq_len(numheatmap)) {
- if (j != numheatmap) {
- partial_wide = wide[((j - 1) * numProtein + 1):(j * numProtein), ]
- } else {
- partial_wide = wide[((j - 1) * numProtein + 1):nrow(wide), ]
- }
- heatmap = .makeHeatmapGgplot2(partial_wide, my.colors, my.breaks, x.axis.size, y.axis.size, height)
- }
+.plotHeatmap <- function(
+ input, log_base_pval, ylimUp, FCcutoff, sig, clustering, numProtein, colorkey,
+ width, height, log_base_FC, x.axis.size, y.axis.size, address, isPlotly) {
+ adj.pvalue <- heat_val <- NULL
+
+ if (length(unique(input$Protein)) <= 1) {
+ stop("At least two proteins are needed for heatmaps.")
+ }
+ if (length(unique(input$Label)) <= 1) {
+ stop("At least two comparisons are needed for heatmaps.")
+ }
+
+ if (is.numeric(ylimUp)) {
+ y.limUp <- ylimUp
+ } else {
+ y.limUp <- ifelse(log_base_pval == 2, 30, 10)
+ input[adj.pvalue < log_base_pval^(-y.limUp), adj.pvalue := log_base_pval^(-y.limUp)]
+ }
+
+ if (is.numeric(FCcutoff)) {
+ input$adj.pvalue <- ifelse(input[, 3] < log(FCcutoff, log_base_FC) & input[, 3] > -log(FCcutoff, log_base_FC),
+ 1, input$adj.pvalue
+ )
+ }
+
+ input[, heat_val := -log(adj.pvalue, log_base_pval) * sign(input[, 3])]
+ wide <- data.table::dcast(input, Protein ~ Label,
+ value.var = "heat_val"
+ )
+ proteins <- wide$Protein
+ wide <- as.matrix(wide[, -1])
+ rownames(wide) <- proteins
+ wide <- wide[rowSums(!is.na(wide)) != 0, colSums(!is.na(wide)) != 0]
+ wide <- .getOrderedMatrix(wide, clustering)
+ up <- 10
+ temp <- 10^(-sort(ceiling(seq(2, up, length = 10)[c(1, 2, 3, 5, 10)]), decreasing = TRUE))
+ breaks <- c(temp, sig)
+ neg.breaks <- log(breaks, log_base_pval)
+ my.breaks <- c(neg.breaks, 0, -neg.breaks[6:1], 101)
+ blocks <- c(-breaks, 1, breaks[6:1])
+ namepro <- rownames(wide)
+ totalpro <- length(namepro)
+ numheatmap <- totalpro %/% numProtein + 1
+
+ my.colors <- maPalette(low = "blue", high = "red", mid = "black", k = 12)
+ my.colors <- c(my.colors, "grey")
+ blocks <- c(blocks, "NA")
+ my.colors.rgb <- lapply(my.colors, function(x) as.vector(col2rgb(x)))
+ color.key.plotly <- .getColorKeyPlotly(my.colors.rgb, blocks)
+
+ if (!isPlotly) {
+ if (colorkey) {
+ .getColorKeyGGPlot2(my.colors, blocks)
}
-
-
-
- if (address != FALSE) {
- dev.off()
+ savePlot(address, "Heatmap", width, height)
+ }
+ blue.red.18 <- maPalette(low = "blue", high = "red", mid = "black", k = 14)
+ if (isPlotly) {
+ # If plotly, plot all proteins on a single heatmap
+ heatmap <- .makeHeatmapPlotly(wide, blue.red.18, my.breaks, x.axis.size, y.axis.size, height, numProtein)
+ } else {
+ for (j in seq_len(numheatmap)) {
+ if (j != numheatmap) {
+ partial_wide <- wide[((j - 1) * numProtein + 1):(j * numProtein), ]
+ } else {
+ partial_wide <- wide[((j - 1) * numProtein + 1):nrow(wide), ]
+ }
+ heatmap <- .makeHeatmapGgplot2(partial_wide, my.colors, my.breaks, x.axis.size, y.axis.size, height)
}
- if(isPlotly) {
- if(colorkey) {
- heatmap_and_color_key <- subplot(heatmap, color.key.plotly, nrows = 2)
-
- heatmap_and_color_key <- plotly::layout(
- heatmap_and_color_key,
- annotations = list(
- list(
- x = 0.5,
- y = 1.1,
- text = "Heatmap",
- showarrow = FALSE,
- xref = "paper",
- yref = "paper",
- font = list(
- size = 18
- )
- ),
- list(
- x = 0.5,
- y = 0.35,
- text = "Color Key",
- showarrow = FALSE,
- xref = "paper",
- yref = "paper",
- font = list(
- size = 18
- )
- )
- ),
- margin = list(l = 50, r = 50, b = 50, t = 50)
+ }
+
+
+
+ if (address != FALSE) {
+ dev.off()
+ }
+ if (isPlotly) {
+ if (colorkey) {
+ heatmap_and_color_key <- subplot(heatmap, color.key.plotly, nrows = 2)
+
+ heatmap_and_color_key <- plotly::layout(
+ heatmap_and_color_key,
+ annotations = list(
+ list(
+ x = 0.5,
+ y = 1.1,
+ text = "Heatmap",
+ showarrow = FALSE,
+ xref = "paper",
+ yref = "paper",
+ font = list(
+ size = 18
+ )
+ ),
+ list(
+ x = 0.5,
+ y = 0.35,
+ text = "Color Key",
+ showarrow = FALSE,
+ xref = "paper",
+ yref = "paper",
+ font = list(
+ size = 18
)
- heatmap_and_color_key
- }
- else {
- heatmap
- }
+ )
+ ),
+ margin = list(l = 50, r = 50, b = 50, t = 50)
+ )
+ heatmap_and_color_key
+ } else {
+ heatmap
}
-
-}
+ }
+}
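# A toy illustration of the cell values used in the heatmap above: signed
# -log(adjusted p-value), here with base 10 as when logBase.pvalue = 10, so
# the sign carries the direction of change and the magnitude carries the
# strength of evidence (numbers made up).
adj_p <- c(0.001, 0.20, 0.04)
log2fc <- c(1.5, -0.2, -2.0)
heat_val <- -log(adj_p, base = 10) * sign(log2fc)
heat_val # positive = up-regulated, negative = down-regulated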
@@ -299,85 +317,91 @@ groupComparisonPlots = function(
#' Preprocess data for volcano plots and create them
#' @inheritParams groupComparisonPlots
#' @keywords internal
-.plotVolcano = function(
+.plotVolcano <- function(
input, which.Comparison, address, width, height, log_base_pval,
ylimUp, ylimDown, FCcutoff, sig, xlimUp, ProteinName, dot.size,
- text.size, legend.size, x.axis.size, y.axis.size, log_base_FC, isPlotly
-) {
- adj.pvalue = colgroup = logFC = Protein = issue = Label = newlogFC = NULL
-
- log_adjp = paste0("log", log_base_pval, "adjp")
- all_labels = unique(input$Label)
- input = input[!is.na(adj.pvalue), ]
- colname_log_fc = intersect(colnames(input), c("log2FC", "log10FC"))
- data.table::setnames(input, colname_log_fc, c("logFC"))
-
- if (address == FALSE) {
- if (which.Comparison == "all") {
- if (length(unique(input$Label)) > 1) {
- stop('** Cannnot generate all volcano plots in a screen. Please set one comparison at a time.')
- }
- } else if (length(which.Comparison) > 1) {
- stop( '** Cannnot generate multiple volcano plots in a screen. Please set one comparison at a time.' )
- }
- }
-
- if (is.numeric(ylimUp)) {
- y.limUp = ylimUp
- } else {
- y.limUp = ifelse(log_base_pval == 2, 30, 10)
- }
- input[, adj.pvalue := ifelse(adj.pvalue < log_base_pval ^ (-y.limUp),
- log_base_pval ^ (-y.limUp), adj.pvalue)]
-
- if (!FCcutoff) {
- logFC_cutoff = 0
- } else {
- logFC_cutoff = log(FCcutoff, log_base_FC)
- }
- input[, colgroup := ifelse(adj.pvalue <= sig & logFC >= logFC_cutoff, "red",
- ifelse(adj.pvalue <= sig & logFC <= -logFC_cutoff, "blue",
- "black"))]
- input[, colgroup := factor(colgroup, levels = c("black", "blue", "red"))]
- input[, Protein := as.character(Protein)]
- input[!is.na(issue) & issue == "oneConditionMissing",
- Protein := paste0("*", Protein)]
- if(!isPlotly) {
- savePlot(address, "VolcanoPlot", width, height)
- }
- plots <- vector("list", length(all_labels))
- for (i in seq_along(all_labels)) {
- label_name = all_labels[i]
- single_label = input[Label == label_name, ]
-
- y.limup = ceiling(max(-log(single_label[!is.na(single_label$adj.pvalue), "adj.pvalue"], log_base_pval)))
- if (y.limup < (-log(sig, log_base_pval))) {
- y.limup = (-log(sig, log_base_pval) + 1) ## for too small y.lim
- }
- y.limdown = ifelse(is.numeric(ylimDown), ylimDown, 0)
- x_ceiling = ceiling(max(abs(single_label[!is.na(single_label$logFC) & is.finite(single_label$logFC), logFC])))
- x.lim = ifelse(is.numeric(xlimUp), xlimUp, ifelse((x_ceiling < 3), 3, x_ceiling))
-
- single_label[[log_adjp]] = -log(single_label$adj.pvalue, log_base_pval)
- single_label$newlogFC = single_label$logFC
- single_label[!is.na(issue) &
- issue == "oneConditionMissing" &
- logFC == Inf, newlogFC := (x.lim - 0.2)]
- single_label[!is.na(issue) &
- issue == "oneConditionMissing" &
- logFC == (-Inf), newlogFC := (x.lim - 0.2) * (-1)]
- plot = .makeVolcano(single_label, label_name, log_base_FC, log_base_pval, x.lim, ProteinName, dot.size,
- y.limdown, y.limup, text.size, FCcutoff, sig, x.axis.size, y.axis.size,
- legend.size, log_adjp)
- print(plot)
- plots[[i]] = plot
- }
- if (address != FALSE) {
- dev.off()
+ text.size, legend.size, x.axis.size, y.axis.size, log_base_FC, isPlotly) {
+ adj.pvalue <- colgroup <- logFC <- Protein <- issue <- Label <- newlogFC <- NULL
+
+ log_adjp <- paste0("log", log_base_pval, "adjp")
+ all_labels <- unique(input$Label)
+ input <- input[!is.na(adj.pvalue), ]
+ colname_log_fc <- intersect(colnames(input), c("log2FC", "log10FC"))
+ data.table::setnames(input, colname_log_fc, c("logFC"))
+
+ if (address == FALSE) {
+ if (which.Comparison == "all") {
+ if (length(unique(input$Label)) > 1) {
+ stop("** Cannnot generate all volcano plots in a screen. Please set one comparison at a time.")
+ }
+ } else if (length(which.Comparison) > 1) {
+ stop("** Cannnot generate multiple volcano plots in a screen. Please set one comparison at a time.")
}
- if (isPlotly) {
- plots
+ }
+
+ if (is.numeric(ylimUp)) {
+ y.limUp <- ylimUp
+ } else {
+ y.limUp <- ifelse(log_base_pval == 2, 30, 10)
+ }
+ input[, adj.pvalue := ifelse(adj.pvalue < log_base_pval^(-y.limUp),
+ log_base_pval^(-y.limUp), adj.pvalue
+ )]
+
+ if (!FCcutoff) {
+ logFC_cutoff <- 0
+ } else {
+ logFC_cutoff <- log(FCcutoff, log_base_FC)
+ }
+ input[, colgroup := ifelse(adj.pvalue <= sig & logFC >= logFC_cutoff, "red",
+ ifelse(adj.pvalue <= sig & logFC <= -logFC_cutoff, "blue",
+ "black"
+ )
+ )]
+ input[, colgroup := factor(colgroup, levels = c("black", "blue", "red"))]
+ input[, Protein := as.character(Protein)]
+ input[
+ !is.na(issue) & issue == "oneConditionMissing",
+ Protein := paste0("*", Protein)
+ ]
+ if (!isPlotly) {
+ savePlot(address, "VolcanoPlot", width, height)
+ }
+ plots <- vector("list", length(all_labels))
+ for (i in seq_along(all_labels)) {
+ label_name <- all_labels[i]
+ single_label <- input[Label == label_name, ]
+
+ y.limup <- ceiling(max(-log(single_label[!is.na(single_label$adj.pvalue), "adj.pvalue"], log_base_pval)))
+ if (y.limup < (-log(sig, log_base_pval))) {
+ y.limup <- (-log(sig, log_base_pval) + 1) ## for too small y.lim
}
+ y.limdown <- ifelse(is.numeric(ylimDown), ylimDown, 0)
+ x_ceiling <- ceiling(max(abs(single_label[!is.na(single_label$logFC) & is.finite(single_label$logFC), logFC])))
+ x.lim <- ifelse(is.numeric(xlimUp), xlimUp, ifelse((x_ceiling < 3), 3, x_ceiling))
+
+ single_label[[log_adjp]] <- -log(single_label$adj.pvalue, log_base_pval)
+ single_label$newlogFC <- single_label$logFC
+ single_label[!is.na(issue) &
+ issue == "oneConditionMissing" &
+ logFC == Inf, newlogFC := (x.lim - 0.2)]
+ single_label[!is.na(issue) &
+ issue == "oneConditionMissing" &
+ logFC == (-Inf), newlogFC := (x.lim - 0.2) * (-1)]
+ plot <- .makeVolcano(
+ single_label, label_name, log_base_FC, log_base_pval, x.lim, ProteinName, dot.size,
+ y.limdown, y.limup, text.size, FCcutoff, sig, x.axis.size, y.axis.size,
+ legend.size, log_adjp
+ )
+ print(plot)
+ plots[[i]] <- plot
+ }
+ if (address != FALSE) {
+ dev.off()
+ }
+ if (isPlotly) {
+ plots
+ }
}
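# A toy illustration of how the volcano plot above places proteins with a
# completely missing condition: an infinite log-fold change is pinned just
# inside the x-axis limit, and the protein name is prefixed with "*"
# (numbers made up).
x.lim <- 3
logFC <- c(1.2, Inf, -Inf)
newlogFC <- ifelse(is.finite(logFC), logFC, sign(logFC) * (x.lim - 0.2))
newlogFC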
#' Preprocess data for comparison plots and create them
@@ -385,51 +409,52 @@ groupComparisonPlots = function(
#' @param input data.table
#' @param log_base_FC log base for log-fold changes - 2 or 10
#' @keywords internal
-.plotComparison = function(
+.plotComparison <- function(
input, proteins, address, width, height, sig, ylimUp, ylimDown,
- text.angle, dot.size, x.axis.size, y.axis.size, log_base_FC, isPlotly
-) {
- adj.pvalue = Protein = ciw = NULL
-
- input = input[!is.na(adj.pvalue), ]
- all_proteins = unique(input$Protein)
-
- if (address == FALSE) {
- if (proteins == "all" | length(proteins) > 1) {
- stop("** Cannnot generate all comparison plots in a screen. Please set one protein at a time.")
- }
- }
- if (proteins != "all") {
- selected_proteins = getSelectedProteins(proteins, all_proteins)
- input = input[Protein %in% selected_proteins, ]
- }
-
- all_proteins = unique(input$Protein)
- input$Protein = factor(input$Protein)
- if(!isPlotly) {
- savePlot(address, "ComparisonPlot", width, height)
- }
- plots <- vector("list", length(all_proteins))
- log_fc_column = intersect(colnames(input), c("log2FC", "log10FC"))
- for (i in seq_along(all_proteins)) {
- single_protein = input[Protein == all_proteins[i], ]
- single_protein[, ciw := qt(1 - sig / (2 * nrow(single_protein)), single_protein$DF) * single_protein$SE]
- data.table::setnames(single_protein, log_fc_column, "logFC")
- y.limup = ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(single_protein$logFC + single_protein$ciw)))
- y.limdown = ifelse(is.numeric(ylimDown), ylimDown, floor(min(single_protein$logFC - single_protein$ciw)))
- hjust = ifelse(text.angle != 0, 1, 0.5)
- vjust = ifelse(text.angle != 0, 1, 0.5)
-
- plot = .makeComparison(single_protein, log_base_FC, dot.size, x.axis.size,
- y.axis.size, text.angle, hjust, vjust, y.limdown,
- y.limup)
- print(plot)
- plots[[i]] = plot
- }
- if (address != FALSE) {
- dev.off()
- }
- if (isPlotly) {
- plots
+ text.angle, dot.size, x.axis.size, y.axis.size, log_base_FC, isPlotly) {
+ adj.pvalue <- Protein <- ciw <- NULL
+
+ input <- input[!is.na(adj.pvalue), ]
+ all_proteins <- unique(input$Protein)
+
+ if (address == FALSE) {
+ if (proteins == "all" | length(proteins) > 1) {
+ stop("** Cannnot generate all comparison plots in a screen. Please set one protein at a time.")
}
+ }
+ if (proteins != "all") {
+ selected_proteins <- getSelectedProteins(proteins, all_proteins)
+ input <- input[Protein %in% selected_proteins, ]
+ }
+
+ all_proteins <- unique(input$Protein)
+ input$Protein <- factor(input$Protein)
+ if (!isPlotly) {
+ savePlot(address, "ComparisonPlot", width, height)
+ }
+ plots <- vector("list", length(all_proteins))
+ log_fc_column <- intersect(colnames(input), c("log2FC", "log10FC"))
+ for (i in seq_along(all_proteins)) {
+ single_protein <- input[Protein == all_proteins[i], ]
+ single_protein[, ciw := qt(1 - sig / (2 * nrow(single_protein)), single_protein$DF) * single_protein$SE]
+ data.table::setnames(single_protein, log_fc_column, "logFC")
+ y.limup <- ifelse(is.numeric(ylimUp), ylimUp, ceiling(max(single_protein$logFC + single_protein$ciw)))
+ y.limdown <- ifelse(is.numeric(ylimDown), ylimDown, floor(min(single_protein$logFC - single_protein$ciw)))
+ hjust <- ifelse(text.angle != 0, 1, 0.5)
+ vjust <- ifelse(text.angle != 0, 1, 0.5)
+
+ plot <- .makeComparison(
+ single_protein, log_base_FC, dot.size, x.axis.size,
+ y.axis.size, text.angle, hjust, vjust, y.limdown,
+ y.limup
+ )
+ print(plot)
+ plots[[i]] <- plot
+ }
+ if (address != FALSE) {
+ dev.off()
+ }
+ if (isPlotly) {
+ plots
+ }
}
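# A toy illustration of the error-bar half-widths computed above: t-based
# intervals whose significance level is split across the number of comparisons
# shown for the protein (numbers made up).
sig <- 0.05
SE <- c(0.12, 0.10, 0.15) # standard errors of the three log-fold changes
DF <- c(4, 4, 4)          # model degrees of freedom
ciw <- qt(1 - sig / (2 * length(SE)), DF) * SE
ciw # half-width of each plotted confidence interval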
diff --git a/R/groupComparisonQCPlots.R b/R/groupComparisonQCPlots.R
index 9934e980..36291ae7 100644
--- a/R/groupComparisonQCPlots.R
+++ b/R/groupComparisonQCPlots.R
@@ -1,266 +1,289 @@
#' Visualization for model-based quality control in fitting model
-#'
-#' @description To check the assumption of linear model for whole plot inference,
-#' modelBasedQCPlots takes the results after fitting models from function
-#' (\code{\link{groupComparison}}) as input and automatically generate two types
-#' of figures in pdf files as output:
+#'
+#' @description To check the assumptions of the linear model for whole plot inference,
+#' modelBasedQCPlots takes the results after fitting models from function
+#' (\code{\link{groupComparison}}) as input and automatically generates two types
+#' of figures in pdf files as output:
#' (1) normal quantile-quantile plot (specify "QQPlot" in option type) for checking
-#' normally distributed errors.;
+#' normally distributed errors;
#' (2) residual plot (specify "ResidualPlot" in option type).
-#'
+#'
#' @param data output from function groupComparison.
-#' @param type choice of visualization. "QQPlots" represents normal quantile-quantile
+#' @param type choice of visualization. "QQPlots" represents normal quantile-quantile
#' plot for each protein after fitting models. "ResidualPlots" represents a plot
#' of residuals versus fitted values for each protein in the dataset.
#' @param axis.size size of axes labels. Default is 10.
#' @param dot.size size of points in the graph for residual plots and QQ plots. Default is 3.
#' @param width width of the saved file. Default is 10.
#' @param height height of the saved file. Default is 10.
-#' @param which.Protein Protein list to draw plots. List can be names of Proteins
-#' or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
+#' @param which.Protein Protein list to draw plots. List can be names of Proteins
+#' or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
#' Default is "all", which generates all plots for each protein.
#' @param address name that will serve as a prefix to the name of output file.
-#'
-#' @details Results based on statistical models for whole plot level inference are
-#' accurate as long as the assumptions of the model are met. The model assumes that
-#' the measurement errors are normally distributed with mean 0 and constant variance.
+#'
+#' @details Results based on statistical models for whole plot level inference are
+#' accurate as long as the assumptions of the model are met. The model assumes that
+#' the measurement errors are normally distributed with mean 0 and constant variance.
#' The assumption of a constant variance can be checked by examining the residuals from the model.
#' \itemize{
#' \item{QQPlots : a normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic.}
#' \item{ResidualPlots : The plots of residuals against predicted(fitted) values. If it shows a random scatter, then the assumption is appropriate.}
#' }
-#'
+#'
#' @return produce a pdf file
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' levels(QuantData$FeatureLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
#' colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
#' # Tests for differentially abundant proteins with models:
#' # label-based SRM experiment with expanded scope of biological replication.
-#' testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
-#' use_log_file = FALSE)
+#' testResultOneComparison <- groupComparison(
+#' contrast.matrix = comparison, data = QuantData,
+#' use_log_file = FALSE
+#' )
#' # normal quantile-quantile plots
-#' modelBasedQCPlots(data=testResultOneComparison, type="QQPlots", address="")
+#' modelBasedQCPlots(data = testResultOneComparison, type = "QQPlots", address = "")
#' # residual plots
-#' modelBasedQCPlots(data=testResultOneComparison, type="ResidualPlots", address="")
-#'
-
-modelBasedQCPlots = function(
- data, type, axis.size = 10, dot.size = 3, width = 10, height = 10,
- which.Protein = "all", address = "", displayDeprecationMessage = TRUE
-) {
- if (length(setdiff(toupper(type), c("QQPLOTS","RESIDUALPLOTS"))) != 0) {
- stop(paste0("Input for type=", type,
- ". However,'type' should be one of QQPlots, ResidualPlots."))
- }
- if (address == FALSE) {
- if(all(which.Protein == 'all')){
- stop('** Cannnot generate all plots in a screen. Please set one protein at a time.')
- } else if (length(which.Protein) > 1) {
- stop('** Cannnot generate multiple plots in a screen. Please set one protein at a time.')
- }
- }
-
- fitted_models = data[["FittedModel"]]
- all_proteins = levels(data$ComparisonResult$Protein)
- if (all(which.Protein != "all")) {
- selected_proteins = getSelectedProteins(which.Protein, all_proteins)
- fitted_models = fitted_models[all_proteins %in% selected_proteins]
- all_proteins = all_proteins[all_proteins %in% selected_proteins]
- }
-
- if (toupper(type) == "QQPLOTS") {
- .plotQQ(fitted_models, all_proteins, width, height, address,
- dot.size, axis.size)
- } else if (toupper(type) == "RESIDUALPLOTS") {
- .plotResiduals(fitted_models, all_proteins, width, height,
- address, dot.size, axis.size)
+#' modelBasedQCPlots(data = testResultOneComparison, type = "ResidualPlots", address = "")
+#'
+modelBasedQCPlots <- function(
+ data, type, axis.size = 10, dot.size = 3, width = 10, height = 10,
+ which.Protein = "all", address = "", displayDeprecationMessage = TRUE) {
+ if (length(setdiff(toupper(type), c("QQPLOTS", "RESIDUALPLOTS"))) != 0) {
+ stop(paste0(
+ "Input for type=", type,
+ ". However,'type' should be one of QQPlots, ResidualPlots."
+ ))
+ }
+ if (address == FALSE) {
+ if (all(which.Protein == "all")) {
+ stop("** Cannnot generate all plots in a screen. Please set one protein at a time.")
+ } else if (length(which.Protein) > 1) {
+ stop("** Cannnot generate multiple plots in a screen. Please set one protein at a time.")
}
+ }
- if(displayDeprecationMessage){
- msg_deprecation = paste("FUNCTION DEPRECATION NOTICE: We would like to",
- "notify you that the modelBasedQCPlots function",
- "currently available in both MSstats",
- "will undergo a transition process. Starting from release 4.14(TBD)",
- "the modelBasedQCPlots function in MSstats will be deprecated.",
- "Please take the necessary steps to update your codebase",
- "and migrate to MSstats::.groupComparisonQCPlots before",
- "release 4.14 to avoid any disruptions to your workflow.")
- message(msg_deprecation)
- }
+ fitted_models <- data[["FittedModel"]]
+ all_proteins <- levels(data$ComparisonResult$Protein)
+ if (all(which.Protein != "all")) {
+ selected_proteins <- getSelectedProteins(which.Protein, all_proteins)
+ fitted_models <- fitted_models[all_proteins %in% selected_proteins]
+ all_proteins <- all_proteins[all_proteins %in% selected_proteins]
+ }
+
+ if (toupper(type) == "QQPLOTS") {
+ .plotQQ(
+ fitted_models, all_proteins, width, height, address,
+ dot.size, axis.size
+ )
+ } else if (toupper(type) == "RESIDUALPLOTS") {
+ .plotResiduals(
+ fitted_models, all_proteins, width, height,
+ address, dot.size, axis.size
+ )
+ }
+
+ if (displayDeprecationMessage) {
+ msg_deprecation <- paste(
+ "FUNCTION DEPRECATION NOTICE: We would like to",
+ "notify you that the modelBasedQCPlots function",
+ "currently available in both MSstats",
+ "will undergo a transition process. Starting from release 4.14(TBD)",
+ "the modelBasedQCPlots function in MSstats will be deprecated.",
+ "Please take the necessary steps to update your codebase",
+ "and migrate to MSstats::.groupComparisonQCPlots before",
+ "release 4.14 to avoid any disruptions to your workflow."
+ )
+ message(msg_deprecation)
+ }
}
-#' @importFrom stats resid quantile qnorm
+#' @importFrom stats resid quantile qnorm
#' @importFrom utils setTxtProgressBar
-.plotQQ = function(fitted_models, all_proteins, width, height, address,
- dot.size, axis.size) {
- residual = NULL
- savePlot(address, "QQPlot", width, height)
- pb = utils::txtProgressBar(min = 0, max = length(fitted_models), style = 3)
- for (i in seq_along(fitted_models)) {
- sub = fitted_models[[i]]
- if(is.null(sub)){
- next()
- }
- if (is(sub, "lm")) {
- sub.residuals = sub$residuals
- } else {
- sub.residuals = resid(sub)
- }
- sub.residuals.table = data.frame("residual" = sub.residuals)
- ## get slope and intercept for qline
- y = quantile(sub.residuals.table$residual[!is.na(sub.residuals.table$residual)],
- c(0.25, 0.75))
- x = qnorm(c(0.25, 0.75))
- slope = diff(y) / diff(x)
- int = y[1L] - slope * x[1L]
-
- plot = ggplot(sub.residuals.table, aes(sample = residual)) +
- geom_point(stat = "qq", alpha = 0.8,
- shape = 20, size = dot.size) +
- scale_shape(solid=FALSE) +
- geom_abline(slope = slope, intercept = int, colour = "red") +
- scale_y_continuous("Sample Quantiles") +
- scale_x_continuous("Theoretical Quantiles") +
- labs(title=paste("Normal Q-Q Plot (", all_proteins[i], ")")) +
- theme(
- panel.background = element_rect(fill = "white", colour = "black"),
- panel.grid.major = element_line(colour = "grey95"),
- panel.grid.minor = element_blank(),
- axis.text.x = element_text(size = axis.size, colour = "black"),
- axis.text.y = element_text(size = axis.size, colour = "black"),
- axis.ticks = element_line(colour = "black"),
- axis.title.x = element_text(size = axis.size+5, vjust = -0.4),
- axis.title.y = element_text(size = axis.size+5, vjust = 0.3),
- title = element_text(size = axis.size + 8, vjust = 1.5),
- legend.position = "none"
- )
- print(plot)
- setTxtProgressBar(pb, i)
+.plotQQ <- function(fitted_models, all_proteins, width, height, address,
+ dot.size, axis.size) {
+ residual <- NULL
+ savePlot(address, "QQPlot", width, height)
+ pb <- utils::txtProgressBar(min = 0, max = length(fitted_models), style = 3)
+ for (i in seq_along(fitted_models)) {
+ sub <- fitted_models[[i]]
+ if (is.null(sub)) {
+ next()
}
- close(pb)
- if (address != FALSE) {
- dev.off()
+ if (is(sub, "lm")) {
+ sub.residuals <- sub$residuals
+ } else {
+ sub.residuals <- resid(sub)
}
+ sub.residuals.table <- data.frame("residual" = sub.residuals)
+ ## get slope and intercept for qline
+ y <- quantile(
+ sub.residuals.table$residual[!is.na(sub.residuals.table$residual)],
+ c(0.25, 0.75)
+ )
+ x <- qnorm(c(0.25, 0.75))
+ slope <- diff(y) / diff(x)
+ int <- y[1L] - slope * x[1L]
+
+ plot <- ggplot(sub.residuals.table, aes(sample = residual)) +
+ geom_point(
+ stat = "qq", alpha = 0.8,
+ shape = 20, size = dot.size
+ ) +
+ scale_shape(solid = FALSE) +
+ geom_abline(slope = slope, intercept = int, colour = "red") +
+ scale_y_continuous("Sample Quantiles") +
+ scale_x_continuous("Theoretical Quantiles") +
+ labs(title = paste("Normal Q-Q Plot (", all_proteins[i], ")")) +
+ theme(
+ panel.background = element_rect(fill = "white", colour = "black"),
+ panel.grid.major = element_line(colour = "grey95"),
+ panel.grid.minor = element_blank(),
+ axis.text.x = element_text(size = axis.size, colour = "black"),
+ axis.text.y = element_text(size = axis.size, colour = "black"),
+ axis.ticks = element_line(colour = "black"),
+ axis.title.x = element_text(size = axis.size + 5, vjust = -0.4),
+ axis.title.y = element_text(size = axis.size + 5, vjust = 0.3),
+ title = element_text(size = axis.size + 8, vjust = 1.5),
+ legend.position = "none"
+ )
+ print(plot)
+ setTxtProgressBar(pb, i)
+ }
+ close(pb)
+ if (address != FALSE) {
+ dev.off()
+ }
}
#' @importFrom stats resid fitted
#' @importFrom utils setTxtProgressBar
-.plotResiduals = function(fitted_models, all_proteins, width, height,
- address, dot.size, axis.size) {
- savePlot(address, "ResidualPlot", width, height)
- pb = utils::txtProgressBar(min = 0, max = length(fitted_models), style = 3)
- for (i in seq_along(fitted_models)) {
- fitted_model = fitted_models[[i]]
- if(is.null(fitted_model)) {
- next()
- }
- if (is(fitted_model, "lm")) {
- sub.residuals = fitted_model$residuals
- sub.fitted = fitted_model$fitted.values
- } else {
- sub.residuals = resid(fitted_model)
- sub.fitted = fitted(fitted_model)
- }
- sub.residuals.table = data.frame("residual" = sub.residuals,
- "fitted" = sub.fitted)
-
- plot = ggplot(aes_string(x = "fitted", y = "residual"),
- data = sub.residuals.table) +
- geom_point(size = dot.size,
- alpha = 0.5) +
- geom_hline(yintercept = 0,
- linetype = "twodash",
- colour = "darkgrey",
- size = 0.6) +
- scale_y_continuous("Residuals") +
- scale_x_continuous("Predicted Abundance") +
- labs(title = all_proteins[i]) +
- theme(
- panel.background = element_rect(fill = "white", colour = "black"),
- panel.grid.major = element_line(colour = "grey95"),
- panel.grid.minor = element_blank(),
- axis.text.x = element_text(size = axis.size, colour="black"),
- axis.text.y = element_text(size = axis.size, colour="black"),
- axis.ticks = element_line(colour = "black"),
- axis.title.x = element_text(size = axis.size + 5, vjust = -0.4),
- axis.title.y = element_text(size = axis.size + 5, vjust = 0.3),
- title = element_text(size = axis.size + 8, vjust = 1.5),
- legend.position = "none")
- print(plot)
- setTxtProgressBar(pb, i)
+.plotResiduals <- function(fitted_models, all_proteins, width, height,
+ address, dot.size, axis.size) {
+ savePlot(address, "ResidualPlot", width, height)
+ pb <- utils::txtProgressBar(min = 0, max = length(fitted_models), style = 3)
+ for (i in seq_along(fitted_models)) {
+ fitted_model <- fitted_models[[i]]
+ if (is.null(fitted_model)) {
+ next()
}
- close(pb)
- if (address != FALSE) {
- dev.off()
+ if (is(fitted_model, "lm")) {
+ sub.residuals <- fitted_model$residuals
+ sub.fitted <- fitted_model$fitted.values
+ } else {
+ sub.residuals <- resid(fitted_model)
+ sub.fitted <- fitted(fitted_model)
}
+ sub.residuals.table <- data.frame(
+ "residual" = sub.residuals,
+ "fitted" = sub.fitted
+ )
+
+ plot <- ggplot(aes_string(x = "fitted", y = "residual"),
+ data = sub.residuals.table
+ ) +
+ geom_point(
+ size = dot.size,
+ alpha = 0.5
+ ) +
+ geom_hline(
+ yintercept = 0,
+ linetype = "twodash",
+ colour = "darkgrey",
+ size = 0.6
+ ) +
+ scale_y_continuous("Residuals") +
+ scale_x_continuous("Predicted Abundance") +
+ labs(title = all_proteins[i]) +
+ theme(
+ panel.background = element_rect(fill = "white", colour = "black"),
+ panel.grid.major = element_line(colour = "grey95"),
+ panel.grid.minor = element_blank(),
+ axis.text.x = element_text(size = axis.size, colour = "black"),
+ axis.text.y = element_text(size = axis.size, colour = "black"),
+ axis.ticks = element_line(colour = "black"),
+ axis.title.x = element_text(size = axis.size + 5, vjust = -0.4),
+ axis.title.y = element_text(size = axis.size + 5, vjust = 0.3),
+ title = element_text(size = axis.size + 8, vjust = 1.5),
+ legend.position = "none"
+ )
+ print(plot)
+ setTxtProgressBar(pb, i)
+ }
+ close(pb)
+ if (address != FALSE) {
+ dev.off()
+ }
}
### new function which is a wrapper for modelBasedQCPlots
#' Visualization for model-based quality control in fitting model
-#'
-#' @description To check the assumption of linear model for whole plot inference,
-#' groupComparisonQCPlots takes the results after fitting models from function
-#' (\code{\link{groupComparison}}) as input and automatically generate two types
-#' of figures in pdf files as output:
+#'
+#' @description To check the assumptions of the linear model for whole-plot inference,
+#' groupComparisonQCPlots takes the results of model fitting from the function
+#' (\code{\link{groupComparison}}) as input and automatically generates two types
+#' of figures in pdf files as output:
#' (1) normal quantile-quantile plot (specify "QQPlot" in option type) for checking
-#' normally distributed errors.;
+#' normally distributed errors;
#' (2) residual plot (specify "ResidualPlot" in option type).
-#'
+#'
#' @param data output from function groupComparison.
-#' @param type choice of visualization. "QQPlots" represents normal quantile-quantile
+#' @param type choice of visualization. "QQPlots" represents normal quantile-quantile
#' plot for each protein after fitting models. "ResidualPlots" represents a plot
#' of residuals versus fitted values for each protein in the dataset.
#' @param axis.size size of axes labels. Default is 10.
#' @param dot.size size of points in the graph for residual plots and QQ plots. Default is 3.
#' @param width width of the saved file. Default is 10.
#' @param height height of the saved file. Default is 10.
-#' @param which.Protein Protein list to draw plots. List can be names of Proteins
-#' or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
+#' @param which.Protein Protein list to draw plots. List can be names of Proteins
+#' or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
#' Default is "all", which generates all plots for each protein.
#' @param address name that will serve as a prefix to the name of output file.
-#'
-#' @details Results based on statistical models for whole plot level inference are
-#' accurate as long as the assumptions of the model are met. The model assumes that
-#' the measurement errors are normally distributed with mean 0 and constant variance.
+#'
+#' @details Results based on statistical models for whole plot level inference are
+#' accurate as long as the assumptions of the model are met. The model assumes that
+#' the measurement errors are normally distributed with mean 0 and constant variance.
#' The assumption of a constant variance can be checked by examining the residuals from the model.
#' \itemize{
#' \item{QQPlots : a normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic.}
#' \item{ResidualPlots : The plots of residuals against predicted(fitted) values. If it shows a random scatter, then the assumption is appropriate.}
#' }
-#'
+#'
#' @return produce a pdf file
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' levels(QuantData$FeatureLevelData$GROUP)
-#' comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+#' comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
#' row.names(comparison) <- "T7-T1"
#' colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
#' # Tests for differentially abundant proteins with models:
#' # label-based SRM experiment with expanded scope of biological replication.
-#' testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
-#' use_log_file = FALSE)
+#' testResultOneComparison <- groupComparison(
+#' contrast.matrix = comparison, data = QuantData,
+#' use_log_file = FALSE
+#' )
#' # normal quantile-quantile plots
-#' groupComparisonQCPlots(data=testResultOneComparison, type="QQPlots", address="")
+#' groupComparisonQCPlots(data = testResultOneComparison, type = "QQPlots", address = "")
#' # residual plots
-#' groupComparisonQCPlots(data=testResultOneComparison, type="ResidualPlots", address="")
-#'
-
-groupComparisonQCPlots = function(data, type, axis.size = 10, dot.size = 3, width = 10, height = 10,
- which.Protein = "all", address = ""){
-
- modelBasedQCPlots(data, type, axis.size, dot.size, width, height,
- which.Protein, address, FALSE)
+#' groupComparisonQCPlots(data = testResultOneComparison, type = "ResidualPlots", address = "")
+#'
+groupComparisonQCPlots <- function(
+ data, type, axis.size = 10, dot.size = 3, width = 10, height = 10,
+ which.Protein = "all", address = "") {
+ modelBasedQCPlots(
+ data, type, axis.size, dot.size, width, height,
+ which.Protein, address, FALSE
+ )
}
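
For reference, the qline drawn by .plotQQ passes through the first and third quartiles of the residuals, the same construction base R's qqline() uses by default. A short sketch, assuming only a numeric vector of residuals:

residuals_vec <- rnorm(30)  # stand-in for resid(fitted_model)
y <- quantile(residuals_vec, c(0.25, 0.75), na.rm = TRUE)
x <- qnorm(c(0.25, 0.75))
slope <- diff(y) / diff(x)          # slope through the two quartile points
intercept <- y[1L] - slope * x[1L]
# base-R equivalent for comparison: qqnorm(residuals_vec); qqline(residuals_vec)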
diff --git a/R/msstats-package.R b/R/msstats-package.R
index 40920795..89441d25 100644
--- a/R/msstats-package.R
+++ b/R/msstats-package.R
@@ -10,6 +10,6 @@ NULL
#' @param ... ignored
#' @return none, sets options called MSstatsLog and MSstatsMsg
#' @keywords internal
-.onLoad = function(...) {
- MSstatsConvert::MSstatsLogsSettings()
+.onLoad <- function(...) {
+ MSstatsConvert::MSstatsLogsSettings()
}
diff --git a/R/quantification.R b/R/quantification.R
index b3f27573..91ebd371 100644
--- a/R/quantification.R
+++ b/R/quantification.R
@@ -40,78 +40,88 @@
#' # Sample quantification shows model-based estimation of protein abundance in each biological
#' # replicate within each time point.
#' # Group quantification shows model-based estimation of protein abundance in each time point.
-#' QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+#' QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
#' head(QuantData$FeatureLevelData)
#' # Sample quantification
-#' sampleQuant<-quantification(QuantData, use_log_file = FALSE)
+#' sampleQuant <- quantification(QuantData, use_log_file = FALSE)
#' head(sampleQuant)
#' # Group quantification
-#' groupQuant<-quantification(QuantData, type="Group", use_log_file = FALSE)
+#' groupQuant <- quantification(QuantData, type = "Group", use_log_file = FALSE)
#' head(groupQuant)
#'
-quantification = function(
- data, type = "Sample", format="matrix", use_log_file = TRUE, append = FALSE,
- verbose = TRUE, log_file_path = NULL
-) {
- LogIntensities = NULL
-
- MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
- log_file_path, base = "MSstats_quant_log_")
- getOption("MSstatsLog")("INFO", "MSstats - quantification function")
- checkmate::assertChoice(toupper(type), c("SAMPLE", "GROUP"),
- .var.name = "type")
- checkmate::assertChoice(toupper(format), c("MATRIX", "LONG"),
- .var.name = "format")
- getOption("MSstatsLog")("INFO", paste0("type of quantification = ", type))
- getOption("MSstatsLog")("INFO", paste0("format of output = ", format))
+quantification <- function(
+ data, type = "Sample", format = "matrix", use_log_file = TRUE, append = FALSE,
+ verbose = TRUE, log_file_path = NULL) {
+ LogIntensities <- NULL
- if (toupper(type) == "SAMPLE") {
- datarun = data.table::as.data.table(data$ProteinLevelData)
- datarun$Protein = factor(datarun$Protein)
- datarun = datarun[!is.na(LogIntensities), ]
- datam = data.table::dcast(Protein ~ GROUP + SUBJECT,
- data = datarun,
- value.var = 'LogIntensities',
- fun.aggregate = median)
- if (format == "long") {
- data_l = melt(datam, id.vars=c('Protein'))
- colnames(data_l)[colnames(data_l) %in% c("variable", "value")] = c('Group_Subject', 'LogIntensity')
- }
- getOption("MSstatsLog")("INFO", "Finish sample quantificiation - okay.")
- if (format == "long") {
- return(data_l)
- }
- if (format == "matrix") {
- return(datam)
- }
+ MSstatsConvert::MSstatsLogsSettings(use_log_file, append, verbose,
+ log_file_path,
+ base = "MSstats_quant_log_"
+ )
+ getOption("MSstatsLog")("INFO", "MSstats - quantification function")
+ checkmate::assertChoice(toupper(type), c("SAMPLE", "GROUP"),
+ .var.name = "type"
+ )
+ checkmate::assertChoice(toupper(format), c("MATRIX", "LONG"),
+ .var.name = "format"
+ )
+ getOption("MSstatsLog")("INFO", paste0("type of quantification = ", type))
+ getOption("MSstatsLog")("INFO", paste0("format of output = ", format))
+
+ if (toupper(type) == "SAMPLE") {
+ datarun <- data.table::as.data.table(data$ProteinLevelData)
+ datarun$Protein <- factor(datarun$Protein)
+ datarun <- datarun[!is.na(LogIntensities), ]
+ datam <- data.table::dcast(Protein ~ GROUP + SUBJECT,
+ data = datarun,
+ value.var = "LogIntensities",
+ fun.aggregate = median
+ )
+ if (format == "long") {
+ data_l <- melt(datam, id.vars = c("Protein"))
+ colnames(data_l)[colnames(data_l) %in% c("variable", "value")] <- c("Group_Subject", "LogIntensity")
+ }
+ getOption("MSstatsLog")("INFO", "Finish sample quantificiation - okay.")
+ if (format == "long") {
+ return(data_l)
+ }
+ if (format == "matrix") {
+ return(datam)
}
+ }
- if (toupper(type) == "GROUP") {
- datarun = data.table::as.data.table(data$ProteinLevelData)
- datarun$Protein = factor(datarun$Protein)
- datarun = datarun[!is.na(LogIntensities), ]
- datam = data.table::dcast(Protein + GROUP ~ SUBJECT,
- data = datarun,
- value.var = 'LogIntensities',
- fun.aggregate = median)
- datam2 = data.table::melt(datam, id.vars = c('Protein', "GROUP"))
- data.table::setnames(datam2, c("variable", "value"),
- c("Subject", "LogIntensity"))
- datam3 = data.table::dcast(Protein ~ GROUP,
- data = datam2,
- value.var = 'LogIntensity',
- fun.aggregate = function(x) median(x, na.rm=TRUE))
- if (format == "long") {
- data_l = melt(datam3, id.vars=c('Protein'))
- data.table::setnames(data_l, c("variable", "value"),
- c("Group", "LogIntensity"))
- }
- getOption("MSstatsLog")("INFO", "Finish group quantificiation - okay.")
- if (format == "long") {
- return(data_l)
- }
- if (format == "matrix") {
- return(datam3)
- }
+ if (toupper(type) == "GROUP") {
+ datarun <- data.table::as.data.table(data$ProteinLevelData)
+ datarun$Protein <- factor(datarun$Protein)
+ datarun <- datarun[!is.na(LogIntensities), ]
+ datam <- data.table::dcast(Protein + GROUP ~ SUBJECT,
+ data = datarun,
+ value.var = "LogIntensities",
+ fun.aggregate = median
+ )
+ datam2 <- data.table::melt(datam, id.vars = c("Protein", "GROUP"))
+ data.table::setnames(
+ datam2, c("variable", "value"),
+ c("Subject", "LogIntensity")
+ )
+ datam3 <- data.table::dcast(Protein ~ GROUP,
+ data = datam2,
+ value.var = "LogIntensity",
+ fun.aggregate = function(x) median(x, na.rm = TRUE)
+ )
+ if (format == "long") {
+ data_l <- melt(datam3, id.vars = c("Protein"))
+ data.table::setnames(
+ data_l, c("variable", "value"),
+ c("Group", "LogIntensity")
+ )
+ }
+ getOption("MSstatsLog")("INFO", "Finish group quantificiation - okay.")
+ if (format == "long") {
+ return(data_l)
+ }
+ if (format == "matrix") {
+ return(datam3)
}
+ }
}
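
The sample-quantification branch above amounts to reshaping run-level protein summaries into a Protein-by-(Group_Subject) matrix, taking the median when a subject has several runs. A minimal sketch on made-up run-level data (column names mirror ProteinLevelData):

library(data.table)
protein_level <- data.table(
  Protein = c("P1", "P1", "P1", "P2"),
  GROUP = c("A", "A", "B", "A"),
  SUBJECT = c("1", "1", "2", "1"),
  LogIntensities = c(20.1, 20.3, 18.7, 22.0)
)
# one column per GROUP_SUBJECT combination; medians collapse replicate runs
dcast(protein_level, Protein ~ GROUP + SUBJECT,
      value.var = "LogIntensities", fun.aggregate = median)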
diff --git a/R/utils_censored.R b/R/utils_censored.R
index 338cfaf2..f3cac5da 100644
--- a/R/utils_censored.R
+++ b/R/utils_censored.R
@@ -1,123 +1,136 @@
-#' Handle censored missing values
-#'
+#' Handle censored missing values
+#'
#' @param input `data.table` in MSstats data format
#' @param summary_method summarization method (`summaryMethod` parameter to `dataProcess`)
-#' @param impute if TRUE, missing values are supposed to be imputed
+#' @param impute if TRUE, missing values are supposed to be imputed
#' (`MBimpute` parameter to `dataProcess`)
#' @param missing_symbol `censoredInt` parameter to `dataProcess`
#' @param censored_cutoff `maxQuantileforCensored` parameter to `dataProcess`
-#'
+#'
#' @importFrom stats quantile
-#'
+#'
#' @export
-#'
+#'
#' @return data.table
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
#' head(input)
-#'
-MSstatsHandleMissing = function(input, summary_method, impute,
- missing_symbol, censored_cutoff) {
- INTENSITY = LABEL = ABUNDANCE = censored = NULL
-
- if ((summary_method == "TMP" & impute) & !is.null(missing_symbol)) {
- input$censored = FALSE
- ## if intensity = 1, but abundance > cutoff after normalization, it also should be censored.
- if (!is.null(censored_cutoff)) {
- quantiles = input[!is.na(INTENSITY) & INTENSITY > 1 & LABEL == "L",
- quantile(ABUNDANCE,
- prob = c(0.01, 0.25, 0.5, 0.75,
- censored_cutoff),
- na.rm = TRUE)]
- iqr = quantiles[4] - quantiles[2]
- multiplier = (quantiles[5] - quantiles[4]) / iqr
- cutoff_lower = (quantiles[2] - multiplier * iqr)
- input$censored = !is.na(input$INTENSITY) &
- input$LABEL == "L" &
- input$ABUNDANCE < cutoff_lower
- if (cutoff_lower <= 0 & !is.null(missing_symbol) & missing_symbol == "0") {
- zero_one_filter = !is.na(input$ABUNDANCE) & input$ABUNDANCE <= 0
- input$censored = ifelse(zero_one_filter, TRUE, input$censored)
- }
- if (!is.null(missing_symbol) & missing_symbol == "NA") {
- input$censored = ifelse(is.na(input$INTENSITY), TRUE,
- input$censored)
- }
-
- msg = paste('** Log2 intensities under cutoff =',
- format(cutoff_lower, digits = 5),
- ' were considered as censored missing values.')
- msg_2 = paste("** Log2 intensities =", missing_symbol, "were considered as censored missing values.")
-
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg_2)
-
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg_2)
-
- } else {
- if (missing_symbol == '0') {
- input$censored = input$LABEL == "L" &
- !is.na(input$INTENSITY) &
- (input$INTENSITY == 1 | input$ABUNDANCE <= 0)
- } else if (missing_symbol == 'NA') {
- input$censored = input$LABEL == "L" & is.na(input$ABUNDANCE)
- }
- }
- input[, censored := ifelse(LABEL == "H", FALSE, censored)]
+#'
+MSstatsHandleMissing <- function(input, summary_method, impute,
+ missing_symbol, censored_cutoff) {
+ INTENSITY <- LABEL <- ABUNDANCE <- censored <- NULL
+
+ if ((summary_method == "TMP" & impute) & !is.null(missing_symbol)) {
+ input$censored <- FALSE
+ ## even if intensity > 1, abundances below the cutoff after normalization should also be treated as censored.
+ if (!is.null(censored_cutoff)) {
+ quantiles <- input[
+ !is.na(INTENSITY) & INTENSITY > 1 & LABEL == "L",
+ quantile(ABUNDANCE,
+ prob = c(
+ 0.01, 0.25, 0.5, 0.75,
+ censored_cutoff
+ ),
+ na.rm = TRUE
+ )
+ ]
+ iqr <- quantiles[4] - quantiles[2]
+ multiplier <- (quantiles[5] - quantiles[4]) / iqr
+ cutoff_lower <- (quantiles[2] - multiplier * iqr)
+ input$censored <- !is.na(input$INTENSITY) &
+ input$LABEL == "L" &
+ input$ABUNDANCE < cutoff_lower
+ if (cutoff_lower <= 0 & !is.null(missing_symbol) & missing_symbol == "0") {
+ zero_one_filter <- !is.na(input$ABUNDANCE) & input$ABUNDANCE <= 0
+ input$censored <- ifelse(zero_one_filter, TRUE, input$censored)
+ }
+ if (!is.null(missing_symbol) & missing_symbol == "NA") {
+ input$censored <- ifelse(is.na(input$INTENSITY), TRUE,
+ input$censored
+ )
+ }
+
+ msg <- paste(
+ "** Log2 intensities under cutoff =",
+ format(cutoff_lower, digits = 5),
+ " were considered as censored missing values."
+ )
+ msg_2 <- paste("** Log2 intensities =", missing_symbol, "were considered as censored missing values.")
+
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg_2)
+
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg_2)
} else {
- input$censored = FALSE
+ if (missing_symbol == "0") {
+ input$censored <- input$LABEL == "L" &
+ !is.na(input$INTENSITY) &
+ (input$INTENSITY == 1 | input$ABUNDANCE <= 0)
+ } else if (missing_symbol == "NA") {
+ input$censored <- input$LABEL == "L" & is.na(input$ABUNDANCE)
+ }
}
- input
+ input[, censored := ifelse(LABEL == "H", FALSE, censored)]
+ } else {
+ input$censored <- FALSE
+ }
+ input
}
#' Set censored values based on minimum in run/feature/run or feature.
-#' This is used to initialize the AFT imputation model by supplying the maximum
+#' This is used to initialize the AFT imputation model by supplying the maximum
#' possible values for left-censored data as the `time` input to the Surv function.
#' @param input `data.table` in MSstats format
#' @param censored_symbol censoredInt parameter to `dataProcess`
#' @param remove50missing if TRUE, features with at least 50% missing values
#' will be removed
#' @keywords internal
-.setCensoredByThreshold = function(input, censored_symbol, remove50missing) {
- total_features = n_obs = newABUNDANCE = n_obs_run = censored = NULL
- nonmissing_all = ABUNDANCE_cut = NULL
-
+.setCensoredByThreshold <- function(input, censored_symbol, remove50missing) {
+ total_features <- n_obs <- newABUNDANCE <- n_obs_run <- censored <- NULL
+ nonmissing_all <- ABUNDANCE_cut <- NULL
+
if (censored_symbol == "NA") {
input[, nonmissing_all := !is.na(newABUNDANCE)]
} else if (censored_symbol == "0") {
input[, nonmissing_all := !is.na(newABUNDANCE) & input$newABUNDANCE != 0]
}
-
- input[, nonmissing_all := ifelse(total_features > 1 & n_obs <= 1,
- FALSE, nonmissing_all)]
- valid_observations = input[n_obs > 1 & n_obs_run > 0 & nonmissing_all,
- .(PROTEIN, FEATURE, LABEL, newABUNDANCE)]
- min_abundance_by_group = valid_observations[, .(
- min_abundance = min(newABUNDANCE, na.rm = TRUE)
+
+ input[, nonmissing_all := ifelse(total_features > 1 & n_obs <= 1,
+ FALSE, nonmissing_all
+ )]
+ valid_observations <- input[
+ n_obs > 1 & n_obs_run > 0 & nonmissing_all,
+ .(PROTEIN, FEATURE, LABEL, newABUNDANCE)
+ ]
+ min_abundance_by_group <- valid_observations[, .(
+ min_abundance = min(newABUNDANCE, na.rm = TRUE)
), by = .(PROTEIN, FEATURE, LABEL)]
min_abundance_by_group[, abundance_cutoff := 0.99 * min_abundance]
input[min_abundance_by_group, ABUNDANCE_cut := ifelse(
- n_obs > 1 & n_obs_run > 0, abundance_cutoff, NA
- ), on = c("PROTEIN", "FEATURE", "LABEL")]
+ n_obs > 1 & n_obs_run > 0, abundance_cutoff, NA
+ ), on = c("PROTEIN", "FEATURE", "LABEL")]
input[, any_censored := any(censored & n_obs > 1 & n_obs_run > 0),
- by = "PROTEIN"]
+ by = "PROTEIN"
+ ]
if (censored_symbol == "NA") {
- input[, newABUNDANCE := ifelse(!nonmissing_all & censored & is.finite(ABUNDANCE_cut) & any_censored,
- ABUNDANCE_cut, newABUNDANCE)]
+ input[, newABUNDANCE := ifelse(!nonmissing_all & censored & is.finite(ABUNDANCE_cut) & any_censored,
+ ABUNDANCE_cut, newABUNDANCE
+ )]
} else if (censored_symbol == "0") {
- input[, newABUNDANCE := ifelse(!nonmissing_all & newABUNDANCE == 0 & is.finite(ABUNDANCE_cut) & any_censored,
- ABUNDANCE_cut, newABUNDANCE)]
+ input[, newABUNDANCE := ifelse(!nonmissing_all & newABUNDANCE == 0 & is.finite(ABUNDANCE_cut) & any_censored,
+ ABUNDANCE_cut, newABUNDANCE
+ )]
}
}
@@ -127,17 +140,17 @@ MSstatsHandleMissing = function(input, summary_method, impute,
#' @param impute if TRUE, missing values are supposed to be imputed
#' @param censored_symbol `censoredInt` parameter to dataProcess
#' @keywords internal
-.getNonMissingFilter = function(input, impute, censored_symbol) {
- if (impute) {
- if (!is.null(censored_symbol)) {
- if (censored_symbol == "0") {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
- } else if (censored_symbol == "NA") {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE)
- }
- }
- } else {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
+.getNonMissingFilter <- function(input, impute, censored_symbol) {
+ if (impute) {
+ if (!is.null(censored_symbol)) {
+ if (censored_symbol == "0") {
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
+ } else if (censored_symbol == "NA") {
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$newABUNDANCE)
+ }
}
- nonmissing_filter
+ } else {
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$newABUNDANCE) & input$newABUNDANCE != 0
+ }
+ nonmissing_filter
}
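
The cutoff computed in MSstatsHandleMissing when maxQuantileforCensored is set can be read as: measure how far the censored_cutoff quantile sits above the third quartile, in IQR units, and mirror that distance below the first quartile. A standalone sketch of the rule on simulated log-abundances:

set.seed(1)
abundance <- rnorm(1000, mean = 20, sd = 2)  # stand-in for observed log2 abundances
censored_cutoff <- 0.999
q <- quantile(abundance, probs = c(0.01, 0.25, 0.5, 0.75, censored_cutoff), na.rm = TRUE)
iqr <- q[4] - q[2]                      # inter-quartile range
multiplier <- (q[5] - q[4]) / iqr       # upper-tail distance expressed in IQR units
cutoff_lower <- q[2] - multiplier * iqr # mirrored below the first quartile
sum(abundance < cutoff_lower)           # observations that would be flagged as censored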
diff --git a/R/utils_checks.R b/R/utils_checks.R
index adfe09bb..905eb7bc 100644
--- a/R/utils_checks.R
+++ b/R/utils_checks.R
@@ -1,37 +1,41 @@
#' Check if annotation matches intended experimental design
-#'
+#'
#' @param msstats_table output of a converter function
#' @param design_type character, "group comparison" or "repeated measures"
-#'
+#'
#' @importFrom data.table as.data.table uniqueN
-#'
+#'
#' @return TRUE if annotation file is consistent with intended experimental design. Otherwise, an error is thrown
#' @export
-#'
-validateAnnotation = function(msstats_table, design_type = "group comparison") {
- annotation = unique(data.table::as.data.table(msstats_table)[, list(BioReplicate, Condition)])
- if (data.table::uniqueN(annotation[["Condition"]]) == 1) {
+#'
+validateAnnotation <- function(msstats_table, design_type = "group comparison") {
+ annotation <- unique(data.table::as.data.table(msstats_table)[, list(BioReplicate, Condition)])
+ if (data.table::uniqueN(annotation[["Condition"]]) == 1) {
stop("MSstats performs relative protein quantification, which requires more than one Condition. Please check the annotation file.")
}
-
- num_conditions_per_biorep = annotation[, list(NumConditions = data.table::uniqueN(Condition)), by = "BioReplicate"]
- num_bioreps_per_condition = annotation[, list(NumBioReps = data.table::uniqueN(BioReplicate)), by = "Condition"]
+
+ num_conditions_per_biorep <- annotation[, list(NumConditions = data.table::uniqueN(Condition)), by = "BioReplicate"]
+ num_bioreps_per_condition <- annotation[, list(NumBioReps = data.table::uniqueN(BioReplicate)), by = "Condition"]
if (design_type == "group comparison") {
- if (all(num_conditions_per_biorep[["NumConditions"]] == 1L)) {
- return(TRUE)
- } else {
- stop(paste("In group comparison design, each biological replicate should be assigned to a single condition.\n",
- "Currently, some biological replicates match to multiple conditions"))
- }
- if (all(num_bioreps_per_condition[["NumBioReps"]] == 1L)) {
+ if (all(num_conditions_per_biorep[["NumConditions"]] == 1L)) {
+ return(TRUE)
+ } else {
+ stop(paste(
+ "In group comparison design, each biological replicate should be assigned to a single condition.\n",
+ "Currently, some biological replicates match to multiple conditions"
+ ))
+ }
+ if (all(num_bioreps_per_condition[["NumBioReps"]] == 1L)) {
message("Each condition only consists of a single biological replicate. Hypothesis testing for differential abundance will not be possible.")
}
} else if (design_type == "repeated measures") {
if (all(num_conditions_per_biorep[["NumConditions"]] > 1)) {
return(TRUE)
} else {
- stop(paste("In repeated measures design, biological replicates should be measured across multiple conditions.\n",
- "Currently, each biological replicate matches to a different condition."))
+ stop(paste(
+ "In repeated measures design, biological replicates should be measured across multiple conditions.\n",
+ "Currently, each biological replicate matches to a different condition."
+ ))
}
} else {
stop("Unrecognized design type. Accepted values are 'group comparison' or 'repeated measures'")
@@ -39,44 +43,46 @@ validateAnnotation = function(msstats_table, design_type = "group comparison") {
}
#' Prepare data for processing by `dataProcess` function
-#'
+#'
#' @param input `data.table` in MSstats format
#' @param log_base base of the logarithm to transform intensities
#' @inheritParams MSstatsConvert::MSstatsBalancedDesign
#' @export
-#'
+#'
#' @return data.table
-#'
-#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#'
+#' @examples
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
#' head(input)
-#'
-MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
- input = .checkDataValidity(input, fix_missing = fix_missing)
- input = .updateColumnsForProcessing(input)
- .preProcessIntensities(input, log_base)
- input = .makeFactorColumns(input)
- input
+#'
+MSstatsPrepareForDataProcess <- function(input, log_base, fix_missing) {
+ input <- .checkDataValidity(input, fix_missing = fix_missing)
+ input <- .updateColumnsForProcessing(input)
+ .preProcessIntensities(input, log_base)
+ input <- .makeFactorColumns(input)
+ input
}
#' Save information about R session to sessionInfo.txt file.
#' @importFrom utils sessionInfo
#' @keywords internal
-.saveSessionInfo = function() {
- file_name = paste0("msstats_sessionInfo_",
- gsub(" " , "T", as.character(Sys.time())),
- ".txt")
- file_name = gsub(":", "_", file_name, fixed = TRUE)
- session_info = utils::sessionInfo()
- sink(file_name)
- print(session_info)
- sink()
+.saveSessionInfo <- function() {
+ file_name <- paste0(
+ "msstats_sessionInfo_",
+ gsub(" ", "T", as.character(Sys.time())),
+ ".txt"
+ )
+ file_name <- gsub(":", "_", file_name, fixed = TRUE)
+ session_info <- utils::sessionInfo()
+ sink(file_name)
+ print(session_info)
+ sink()
}
#' Check validity of parameters to dataProcess function
@@ -87,36 +93,45 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
#' @param summarization list with elements: method.
#' @param imputation list with elements: cutoff, symbol.
#' @keywords internal
-.checkDataProcessParams = function(log_base, normalization_method,
- standards_names, feature_selection,
- summarization, imputation) {
- checkmate::assertChoice(log_base, c(2, 10), .var.name = "logTrans")
- checkmate::assertChoice(summarization$method, c("linear", "TMP"),
- .var.name = "summaryMethod")
- getOption("MSstatsLog")("INFO", paste("Summary method:",
- summarization$method))
- checkmate::assertChoice(imputation$symbol, c("0", "NA"),
- null.ok = TRUE, .var.name = "censoredInt")
- getOption("MSstatsLog")("INFO", paste("censoredInt:", imputation$symbol))
- if (summarization$method == "TMP" & imputation$MB & is.null(imputation$symbol)) {
- msg = paste("The combination of required input",
- "MBimpute == TRUE and censoredInt = NULL",
- "has no censore missing values.",
- "Imputation will not be performed.- stop")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
- checkmate::assertChoice(toupper(as.character(normalization_method)),
- c("NONE", "FALSE", "EQUALIZEMEDIANS", "QUANTILE", "GLOBALSTANDARDS"),
- .var.name = "normalization")
- if (toupper(as.character(normalization_method)) == "GLOBALSTANDARDS" &
- is.null(standards_names)) {
- msg = paste("For normalization with global standards,",
- "the names of global standards are needed.",
- "Please add 'nameStandards' input.")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
+.checkDataProcessParams <- function(log_base, normalization_method,
+ standards_names, feature_selection,
+ summarization, imputation) {
+ checkmate::assertChoice(log_base, c(2, 10), .var.name = "logTrans")
+ checkmate::assertChoice(summarization$method, c("linear", "TMP"),
+ .var.name = "summaryMethod"
+ )
+ getOption("MSstatsLog")("INFO", paste(
+ "Summary method:",
+ summarization$method
+ ))
+ checkmate::assertChoice(imputation$symbol, c("0", "NA"),
+ null.ok = TRUE, .var.name = "censoredInt"
+ )
+ getOption("MSstatsLog")("INFO", paste("censoredInt:", imputation$symbol))
+ if (summarization$method == "TMP" & imputation$MB & is.null(imputation$symbol)) {
+ msg <- paste(
+ "The combination of required input",
+ "MBimpute == TRUE and censoredInt = NULL",
+ "has no censore missing values.",
+ "Imputation will not be performed.- stop"
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
+ checkmate::assertChoice(toupper(as.character(normalization_method)),
+ c("NONE", "FALSE", "EQUALIZEMEDIANS", "QUANTILE", "GLOBALSTANDARDS"),
+ .var.name = "normalization"
+ )
+ if (toupper(as.character(normalization_method)) == "GLOBALSTANDARDS" &
+ is.null(standards_names)) {
+ msg <- paste(
+ "For normalization with global standards,",
+ "the names of global standards are needed.",
+ "Please add 'nameStandards' input."
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
}
@@ -124,13 +139,15 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
#' @param input data.table
#' @param column_name chr, name of a column to check
#' @keywords internal
-.checkExperimentDesign = function(input, column_name) {
- if (any(is.na(input[[column_name]]))) {
- msg = paste("Missing information in the", column_name, "column. ",
- "Please check the", column_name, "column.", collapse = " ")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
+.checkExperimentDesign <- function(input, column_name) {
+ if (any(is.na(input[[column_name]]))) {
+ msg <- paste("Missing information in the", column_name, "column. ",
+ "Please check the", column_name, "column.",
+ collapse = " "
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
}
@@ -139,62 +156,74 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
#' @inheritParams MSstatsPrepareForDataProcess
#' @importFrom data.table uniqueN as.data.table
#' @keywords internal
-.checkUnProcessedDataValidity = function(input, fix_missing, fill_incomplete) {
- input = data.table::as.data.table(unclass(input))
- cols = c("ProteinName", "PeptideSequence", "PeptideModifiedSequence",
- "PrecursorCharge", "FragmentIon", "ProductCharge",
- "IsotopeLabelType", "Condition", "BioReplicate", "Run", "Intensity")
- provided_cols = intersect(cols, colnames(input))
-
- if (length(provided_cols) < 10) {
- msg = paste("Missing columns in the input:",
- paste(setdiff(cols, colnames(input)), collapse = " "))
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
- data.table::setnames(input, "PeptideModifiedSequence", "PeptideSequence",
- skip_absent = TRUE)
-
- balanced_cols = c("PeptideSequence", "PrecursorCharge",
- "FragmentIon", "ProductCharge")
- input = MSstatsConvert::MSstatsBalancedDesign(
- input, balanced_cols, TRUE, TRUE, fix_missing)
- input = data.table::as.data.table(unclass(input))
- data.table::setnames(input, colnames(input), toupper(colnames(input)))
-
-
- if (!is.numeric(input$INTENSITY)) {
- suppressWarnings({
- input$INTENSITY = as.numeric(as.character(input$INTENSITY))
- })
- }
-
- .checkExperimentDesign(input, "RUN")
- .checkExperimentDesign(input, "BIOREPLICATE")
- .checkExperimentDesign(input, "CONDITION")
-
- cols = toupper(cols)
- cols = intersect(c(cols, "FRACTION", "TECHREPLICATE"),
- colnames(input))
- input = input[, cols, with = FALSE]
-
- input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
- input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
-
- if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) {
- getOption("MSstatsLog")("ERROR",
- paste("There are more than two levels of labeling.",
- "So far, only label-free or reference-labeled experiment are supported. - stop"))
- stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.")
- }
-
- input$ISOTOPELABELTYPE = factor(input$ISOTOPELABELTYPE)
- if (data.table::uniqueN(input$ISOTOPELABELTYPE) == 2) {
- levels(input$ISOTOPELABELTYPE) = c("H", "L")
- } else {
- levels(input$ISOTOPELABELTYPE) = "L"
- }
- input
+.checkUnProcessedDataValidity <- function(input, fix_missing, fill_incomplete) {
+ input <- data.table::as.data.table(unclass(input))
+ cols <- c(
+ "ProteinName", "PeptideSequence", "PeptideModifiedSequence",
+ "PrecursorCharge", "FragmentIon", "ProductCharge",
+ "IsotopeLabelType", "Condition", "BioReplicate", "Run", "Intensity"
+ )
+ provided_cols <- intersect(cols, colnames(input))
+
+ if (length(provided_cols) < 10) {
+ msg <- paste(
+ "Missing columns in the input:",
+ paste(setdiff(cols, colnames(input)), collapse = " ")
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
+ data.table::setnames(input, "PeptideModifiedSequence", "PeptideSequence",
+ skip_absent = TRUE
+ )
+
+ balanced_cols <- c(
+ "PeptideSequence", "PrecursorCharge",
+ "FragmentIon", "ProductCharge"
+ )
+ input <- MSstatsConvert::MSstatsBalancedDesign(
+ input, balanced_cols, TRUE, TRUE, fix_missing
+ )
+ input <- data.table::as.data.table(unclass(input))
+ data.table::setnames(input, colnames(input), toupper(colnames(input)))
+
+
+ if (!is.numeric(input$INTENSITY)) {
+ suppressWarnings({
+ input$INTENSITY <- as.numeric(as.character(input$INTENSITY))
+ })
+ }
+
+ .checkExperimentDesign(input, "RUN")
+ .checkExperimentDesign(input, "BIOREPLICATE")
+ .checkExperimentDesign(input, "CONDITION")
+
+ cols <- toupper(cols)
+ cols <- intersect(
+ c(cols, "FRACTION", "TECHREPLICATE"),
+ colnames(input)
+ )
+ input <- input[, cols, with = FALSE]
+
+ input$PEPTIDE <- paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
+ input$TRANSITION <- paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
+
+ if (data.table::uniqueN(input$ISOTOPELABELTYPE) > 2) {
+ getOption("MSstatsLog")("ERROR",
+ paste(
+ "There are more than two levels of labeling.",
+ "So far, only label-free or reference-labeled experiment are supported. - stop"
+ ))
+ stop("Statistical tools in MSstats are only proper for label-free or with reference peptide experiments.")
+ }
+
+ input$ISOTOPELABELTYPE <- factor(input$ISOTOPELABELTYPE)
+ if (data.table::uniqueN(input$ISOTOPELABELTYPE) == 2) {
+ levels(input$ISOTOPELABELTYPE) <- c("H", "L")
+ } else {
+ levels(input$ISOTOPELABELTYPE) <- "L"
+ }
+ input
}
@@ -202,26 +231,29 @@ MSstatsPrepareForDataProcess = function(input, log_base, fix_missing) {
#' @param input data.frame of class `MSstatsValidated`
#' @param .. additional parameters, currently ignored
#' @keywords internal
-.prepareForDataProcess = function(input, ...) {
- input = as.data.table(unclass(input))
- colnames(input) = toupper(colnames(input))
- if (is.element("PEPTIDEMODIFIEDSEQUENCE", colnames(input))) {
- data.table::setnames(
- input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE")
- }
- input$PEPTIDE = paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
- input$TRANSITION = paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
- input$ISOTOPELABELTYPE = factor(input$ISOTOPELABELTYPE)
- if (data.table::uniqueN(input$ISOTOPELABELTYPE) == 2) {
- levels(input$ISOTOPELABELTYPE) = c("H", "L")
- } else {
- levels(input$ISOTOPELABELTYPE) = "L"
- }
- input
+.prepareForDataProcess <- function(input, ...) {
+ input <- as.data.table(unclass(input))
+ colnames(input) <- toupper(colnames(input))
+ if (is.element("PEPTIDEMODIFIEDSEQUENCE", colnames(input))) {
+ data.table::setnames(
+ input, "PEPTIDEMODIFIEDSEQUENCE", "PEPTIDESEQUENCE"
+ )
+ }
+ input$PEPTIDE <- paste(input$PEPTIDESEQUENCE, input$PRECURSORCHARGE, sep = "_")
+ input$TRANSITION <- paste(input$FRAGMENTION, input$PRODUCTCHARGE, sep = "_")
+ input$ISOTOPELABELTYPE <- factor(input$ISOTOPELABELTYPE)
+ if (data.table::uniqueN(input$ISOTOPELABELTYPE) == 2) {
+ levels(input$ISOTOPELABELTYPE) <- c("H", "L")
+ } else {
+ levels(input$ISOTOPELABELTYPE) <- "L"
+ }
+ input
}
-setGeneric(".checkDataValidity",
- function(input, ...) standardGeneric(".checkDataValidity"))
+setGeneric(
+ ".checkDataValidity",
+ function(input, ...) standardGeneric(".checkDataValidity")
+)
setMethod(".checkDataValidity", "data.frame", .checkUnProcessedDataValidity)
setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
@@ -230,108 +262,123 @@ setMethod(".checkDataValidity", "MSstatsValidated", .prepareForDataProcess)
#' @param input data.table
#' @param log_base base of the logarithm
#' @keywords internal
-.preProcessIntensities = function(input, log_base) {
- INTENSITY = ABUNDANCE = NULL
-
- if (any(!is.na(input$INTENSITY) & input$INTENSITY < 1, na.rm = TRUE)) {
- n_smaller_than_1 = sum(!is.na(input$INTENSITY) & input$INTENSITY < 1,
- na.rm = TRUE)
- input[, INTENSITY := ifelse(!is.na(INTENSITY) & INTENSITY < 1,
- 1, INTENSITY)]
- msg = paste("** There are", n_smaller_than_1,
- "intensities which are zero or less than 1.",
- "These intensities are replaced with 1",
- collapse = " ")
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- }
- input[, ABUNDANCE := log(INTENSITY, log_base)]
- getOption("MSstatsLog")("INFO",
- paste("Logarithm transformation with base",
- log_base,
- "is done",
- collapse = " "))
+.preProcessIntensities <- function(input, log_base) {
+ INTENSITY <- ABUNDANCE <- NULL
+
+ if (any(!is.na(input$INTENSITY) & input$INTENSITY < 1, na.rm = TRUE)) {
+ n_smaller_than_1 <- sum(!is.na(input$INTENSITY) & input$INTENSITY < 1,
+ na.rm = TRUE
+ )
+ input[, INTENSITY := ifelse(!is.na(INTENSITY) & INTENSITY < 1,
+ 1, INTENSITY
+ )]
+ msg <- paste("** There are", n_smaller_than_1,
+ "intensities which are zero or less than 1.",
+ "These intensities are replaced with 1",
+ collapse = " "
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ }
+ input[, ABUNDANCE := log(INTENSITY, log_base)]
+ getOption("MSstatsLog")("INFO",
+ paste("Logarithm transformation with base",
+ log_base,
+ "is done",
+ collapse = " "
+ ))
}
#' Create columns for data processing
#' @param input data.table
#' @keywords internal
-.updateColumnsForProcessing = function(input) {
- FEATURE = PEPTIDE = TRANSITION = GROUP = LABEL = GROUP_ORIGINAL = NULL
- SUBJECT = SUBJECT_ORIGINAL = PROTEIN = NULL
-
- data.table::setnames(
- input, c("PROTEINNAME", "ISOTOPELABELTYPE", "CONDITION", "BIOREPLICATE"),
- c("PROTEIN", "LABEL", "GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
- skip_absent = TRUE)
-
- input[, FEATURE := paste(PEPTIDE, TRANSITION, sep = "_")]
- input[, GROUP := ifelse(LABEL == "L", GROUP_ORIGINAL, "0")]
- input[, SUBJECT := ifelse(LABEL == "L", SUBJECT_ORIGINAL, "0")]
-
- cols = c("PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE", "LABEL",
- "GROUP_ORIGINAL", "SUBJECT_ORIGINAL", "RUN", "GROUP",
- "SUBJECT", "FRACTION", "INTENSITY")
- if ("TECHREPLICATE" %in% colnames(input)) {
- cols = unique(c(cols, "TECHREPLICATE"))
- }
- input[!is.na(PROTEIN) & PROTEIN != "", cols, with = FALSE]
+.updateColumnsForProcessing <- function(input) {
+ FEATURE <- PEPTIDE <- TRANSITION <- GROUP <- LABEL <- GROUP_ORIGINAL <- NULL
+ SUBJECT <- SUBJECT_ORIGINAL <- PROTEIN <- NULL
+
+ data.table::setnames(
+ input, c("PROTEINNAME", "ISOTOPELABELTYPE", "CONDITION", "BIOREPLICATE"),
+ c("PROTEIN", "LABEL", "GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+ skip_absent = TRUE
+ )
+
+ input[, FEATURE := paste(PEPTIDE, TRANSITION, sep = "_")]
+ input[, GROUP := ifelse(LABEL == "L", GROUP_ORIGINAL, "0")]
+ input[, SUBJECT := ifelse(LABEL == "L", SUBJECT_ORIGINAL, "0")]
+
+ cols <- c(
+ "PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE", "LABEL",
+ "GROUP_ORIGINAL", "SUBJECT_ORIGINAL", "RUN", "GROUP",
+ "SUBJECT", "FRACTION", "INTENSITY"
+ )
+ if ("TECHREPLICATE" %in% colnames(input)) {
+ cols <- unique(c(cols, "TECHREPLICATE"))
+ }
+ input[!is.na(PROTEIN) & PROTEIN != "", cols, with = FALSE]
}
#' Make factor columns where needed
#' @param input data.table
#' @keywords internal
-.makeFactorColumns = function(input) {
- PROTEIN = PEPTIDE = TRANSITION = LABEL = GROUP_ORIGINAL = RUN = GROUP = NULL
- SUBJECT_ORIGINAL = FEATURE = originalRUN = SUBJECT = NULL
-
- input[, PROTEIN := factor(PROTEIN)]
- input[, PEPTIDE := factor(PEPTIDE)]
- input[, TRANSITION := factor(TRANSITION)]
- input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
- RUN, PROTEIN, PEPTIDE, TRANSITION), ]
- input[, GROUP := factor(GROUP)]
- input[, SUBJECT := factor(SUBJECT)]
- input[, FEATURE := factor(FEATURE)]
- input[, originalRUN := factor(as.character(RUN))]
- input[, RUN := factor(RUN, levels = unique(RUN),
- labels = seq_along(unique(RUN)))]
-
- msg = paste("Factorize in columns(GROUP, SUBJECT, GROUP_ORIGINAL,",
- "SUBJECT_ORIGINAL, FEATURE, RUN)")
- getOption("MSstatsLog")("INFO", msg)
- input
+.makeFactorColumns <- function(input) {
+ PROTEIN <- PEPTIDE <- TRANSITION <- LABEL <- GROUP_ORIGINAL <- RUN <- GROUP <- NULL
+ SUBJECT_ORIGINAL <- FEATURE <- originalRUN <- SUBJECT <- NULL
+
+ input[, PROTEIN := factor(PROTEIN)]
+ input[, PEPTIDE := factor(PEPTIDE)]
+ input[, TRANSITION := factor(TRANSITION)]
+ input <- input[order(
+ LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+ RUN, PROTEIN, PEPTIDE, TRANSITION
+ ), ]
+ input[, GROUP := factor(GROUP)]
+ input[, SUBJECT := factor(SUBJECT)]
+ input[, FEATURE := factor(FEATURE)]
+ input[, originalRUN := factor(as.character(RUN))]
+ input[, RUN := factor(RUN,
+ levels = unique(RUN),
+ labels = seq_along(unique(RUN))
+ )]
+
+ msg <- paste(
+ "Factorize in columns(GROUP, SUBJECT, GROUP_ORIGINAL,",
+ "SUBJECT_ORIGINAL, FEATURE, RUN)"
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ input
}
#' Prepare a peptides dictionary for global standards normalization
-#'
+#'
#' @param input `data.table` in MSstats standard format
#' @param normalization normalization method
-#'
+#'
#' @details This function extracts information required to perform normalization
#' with global standards. It is useful for running the summarization workflow
#' outside of the dataProcess function.
-#'
+#'
#' @export
-#'
-#' @examples
-#' input = data.table::as.data.table(DDARawData)
-#' peptides_dict = makePeptidesDictionary(input, "GLOBALSTANDARDS")
+#'
+#' @examples
+#' input <- data.table::as.data.table(DDARawData)
+#' peptides_dict <- makePeptidesDictionary(input, "GLOBALSTANDARDS")
#' head(peptides_dict) # ready to be passed to the MSstatsNormalize function
-#'
-makePeptidesDictionary = function(input, normalization) {
- PEPTIDE = PeptideSequence = PrecursorCharge = NULL
-
- if (toupper(normalization) == "GLOBALSTANDARDS") {
- cols = intersect(c("PeptideSequence", "PeptideModifiedSequence",
- "PrecursorCharge"), colnames(input))
- peptides_dict = unique(input[, cols, with = FALSE])
- colnames(peptides_dict)[1] = "PeptideSequence"
- peptides_dict[, PEPTIDE := paste(PeptideSequence, PrecursorCharge, sep = "_")]
- } else {
- NULL
- }
+#'
+makePeptidesDictionary <- function(input, normalization) {
+ PEPTIDE <- PeptideSequence <- PrecursorCharge <- NULL
+
+ if (toupper(normalization) == "GLOBALSTANDARDS") {
+ cols <- intersect(c(
+ "PeptideSequence", "PeptideModifiedSequence",
+ "PrecursorCharge"
+ ), colnames(input))
+ peptides_dict <- unique(input[, cols, with = FALSE])
+ colnames(peptides_dict)[1] <- "PeptideSequence"
+ peptides_dict[, PEPTIDE := paste(PeptideSequence, PrecursorCharge, sep = "_")]
+ } else {
+ NULL
+ }
}
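
Since validateAnnotation only inspects the BioReplicate and Condition columns, it can be tried on a toy annotation table; the data below is made up purely for illustration:

toy_annotation <- data.frame(
  BioReplicate = c("mouse1", "mouse2", "mouse3", "mouse4"),
  Condition = c("Control", "Control", "Treated", "Treated")
)
# each BioReplicate maps to exactly one Condition, so this returns TRUE
validateAnnotation(toy_annotation, design_type = "group comparison")
# the same table fails the repeated-measures check, because no BioReplicate
# is measured in more than one Condition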
diff --git a/R/utils_dataprocess_plots.R b/R/utils_dataprocess_plots.R
index 17d8f9f8..60673cdf 100644
--- a/R/utils_dataprocess_plots.R
+++ b/R/utils_dataprocess_plots.R
@@ -1,19 +1,19 @@
#' Get name for y-axis
#' @param temp data.table
#' @keywords internal
-.getYaxis = function(temp) {
- INTENSITY = ABUNDANCE = NULL
-
- temp = temp[!is.na(INTENSITY) & !is.na(ABUNDANCE),]
- temp_abund = temp[1, "ABUNDANCE"]
- temp_inten = temp[1, "INTENSITY"]
- log2_diff = abs(log(temp_inten, 2) - temp_abund)
- log10_diff = abs(log(temp_inten, 10) - temp_abund)
- if (log2_diff < log10_diff) {
- "Log2-intensities"
- } else {
- "Log10-intensities"
- }
+.getYaxis <- function(temp) {
+ INTENSITY <- ABUNDANCE <- NULL
+
+ temp <- temp[!is.na(INTENSITY) & !is.na(ABUNDANCE), ]
+ temp_abund <- temp[1, "ABUNDANCE"]
+ temp_inten <- temp[1, "INTENSITY"]
+ log2_diff <- abs(log(temp_inten, 2) - temp_abund)
+ log10_diff <- abs(log(temp_inten, 10) - temp_abund)
+ if (log2_diff < log10_diff) {
+ "Log2-intensities"
+ } else {
+ "Log10-intensities"
+ }
}
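
The heuristic above infers which log base was used during summarization by back-checking a single intensity/abundance pair. A worked toy case with made-up numbers:

# raw intensity 1024 summarized as abundance 10
intensity <- 1024
abundance <- 10

log2_diff  <- abs(log(intensity, 2) - abundance)   # |10 - 10|    = 0
log10_diff <- abs(log(intensity, 10) - abundance)  # |3.01 - 10| ~= 7

if (log2_diff < log10_diff) "Log2-intensities" else "Log10-intensities"
# "Log2-intensities": the abundances were evidently log2-transformed
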
#' Get data for a single protein to plot
@@ -21,15 +21,15 @@
#' @param all_proteins character, set of protein names
#' @param i integer, index of protein to use
#' @keywords internal
-.getSingleProteinForProfile = function(processed, all_proteins, i) {
- FEATURE = SUBJECT = GROUP = PEPTIDE = NULL
-
- single_protein = processed[processed$PROTEIN == all_proteins[i], ]
- single_protein[, FEATURE := factor(FEATURE)]
- single_protein[, SUBJECT := factor(SUBJECT)]
- single_protein[, GROUP := factor(GROUP)]
- single_protein[, PEPTIDE := factor(PEPTIDE)]
- single_protein
+.getSingleProteinForProfile <- function(processed, all_proteins, i) {
+ FEATURE <- SUBJECT <- GROUP <- PEPTIDE <- NULL
+
+ single_protein <- processed[processed$PROTEIN == all_proteins[i], ]
+ single_protein[, FEATURE := factor(FEATURE)]
+ single_protein[, SUBJECT := factor(SUBJECT)]
+ single_protein[, GROUP := factor(GROUP)]
+ single_protein[, PEPTIDE := factor(PEPTIDE)]
+ single_protein
}
@@ -38,108 +38,127 @@
#' @param input data.table
#' @param is_censored TRUE if censored values were imputed
#' @keywords internal
-.makeProfilePlot = function(
- input, is_censored, featureName, y.limdown, y.limup, x.axis.size,
- y.axis.size, text.size, text.angle, legend.size, dot.size.profile,
- ss, s, cumGroupAxis, yaxis.name, lineNameAxis, groupNametemp, dot_colors
-) {
- RUN = ABUNDANCE = Name = NULL
-
- if (is_censored) {
- input$is_censored = factor(input$is_censored,
- levels = c("FALSE", "TRUE"))
- }
- featureName = toupper(featureName)
- if (featureName == "TRANSITION") {
- type_color = "FEATURE"
- } else {
- type_color = "PEPTIDE"
- }
-
- profile_plot = ggplot(input, aes_string(x = "RUN", y = "newABUNDANCE",
- color = type_color, linetype = "FEATURE")) +
- facet_grid(~LABEL) +
- geom_line(size = 0.5)
-
+.makeProfilePlot <- function(
+ input, is_censored, featureName, y.limdown, y.limup, x.axis.size,
+ y.axis.size, text.size, text.angle, legend.size, dot.size.profile,
+ ss, s, cumGroupAxis, yaxis.name, lineNameAxis, groupNametemp, dot_colors) {
+ RUN <- ABUNDANCE <- Name <- NULL
+
+ if (is_censored) {
+ input$is_censored <- factor(input$is_censored,
+ levels = c("FALSE", "TRUE")
+ )
+ }
+ featureName <- toupper(featureName)
+ if (featureName == "TRANSITION") {
+ type_color <- "FEATURE"
+ } else {
+ type_color <- "PEPTIDE"
+ }
+
+ profile_plot <- ggplot(input, aes_string(
+ x = "RUN", y = "newABUNDANCE",
+ color = type_color, linetype = "FEATURE"
+ )) +
+ facet_grid(~LABEL) +
+ geom_line(size = 0.5)
+
+ if (is_censored) {
+ profile_plot <- profile_plot +
+ geom_point(aes_string(x = "RUN", y = "newABUNDANCE", color = type_color, shape = "censored"),
+ data = input,
+ size = dot.size.profile
+ ) +
+ scale_shape_manual(
+ values = c(16, 1),
+ labels = c("Detected data", "Censored missing data")
+ )
+ } else {
+ profile_plot <- profile_plot +
+ geom_point(size = dot.size.profile) +
+ scale_shape_manual(values = c(16))
+ }
+
+
+ if (featureName == "TRANSITION") {
+ profile_plot <- profile_plot +
+ scale_colour_manual(values = dot_colors[s])
+ } else if (featureName == "PEPTIDE") {
+ profile_plot <- profile_plot +
+ scale_colour_manual(values = dot_colors[seq_along(unique(s))])
+ } else if (featureName == "NA") {
if (is_censored) {
- profile_plot = profile_plot +
- geom_point(aes_string(x = "RUN", y = "newABUNDANCE", color = type_color, shape = "censored"),
- data = input,
- size = dot.size.profile) +
- scale_shape_manual(values = c(16, 1),
- labels = c("Detected data", "Censored missing data"))
+ profile_plot <- profile_plot +
+ scale_colour_manual(values = dot_colors[seq_along(unique(s))])
} else {
- profile_plot = profile_plot +
- geom_point(size = dot.size.profile) +
- scale_shape_manual(values = c(16))
- }
-
-
- if (featureName == "TRANSITION") {
- profile_plot = profile_plot +
- scale_colour_manual(values = dot_colors[s])
- } else if (featureName == "PEPTIDE") {
- profile_plot = profile_plot +
- scale_colour_manual(values = dot_colors[seq_along(unique(s))])
- } else if (featureName == "NA") {
- if (is_censored) {
- profile_plot = profile_plot +
- scale_colour_manual(values = dot_colors[seq_along(unique(s))])
- } else {
- profile_plot = profile_plot +
- scale_colour_manual(values = dot_colors[s])
- }
- }
-
- profile_plot = profile_plot + scale_linetype_manual(values = ss, guide = "none")
- profile_plot = profile_plot +
- scale_x_continuous('MS runs', breaks = cumGroupAxis) +
- scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
- geom_vline(xintercept = lineNameAxis + 0.5, colour = "grey", linetype = "longdash") +
- labs(title = unique(input$PROTEIN)) +
- geom_text(data = groupNametemp, aes(x = RUN, y = ABUNDANCE, label = Name),
- size = text.size,
- angle = text.angle,
- color = "black") +
- theme_msstats("PROFILEPLOT", x.axis.size, y.axis.size, legend.size)
-
- if (featureName == "TRANSITION") {
- color_guide = guide_legend(order=1,
- override.aes = list(size=1.2,
- linetype = ss),
- title = paste("# peptide:", nlevels(input$PEPTIDE)),
- title.theme = element_text(size = 13, angle = 0),
- keywidth = 0.25,
- keyheight = 0.1,
- default.unit = 'inch',
- ncol = 3)
- } else if (featureName == "PEPTIDE") {
- color_guide = guide_legend(order=1,
- title = paste("# peptide:", nlevels(input$PEPTIDE)),
- title.theme = element_text(size = 13, angle = 0),
- keywidth = 0.25,
- keyheight = 0.1,
- default.unit = 'inch',
- ncol = 3)
+ profile_plot <- profile_plot +
+ scale_colour_manual(values = dot_colors[s])
}
- shape_guide = guide_legend(order=2,
- title = NULL,
- label.theme = element_text(size = 11, angle = 0),
- keywidth = 0.1,
- keyheight = 0.1,
- default.unit = 'inch')
- if (is_censored) {
- if (featureName == "NA") {
- profile_plot = profile_plot + guides(color = FALSE,
- shape = shape_guide)
- } else {
- profile_plot = profile_plot + guides(color = color_guide,
- shape = shape_guide)
- }
+ }
+
+ profile_plot <- profile_plot + scale_linetype_manual(values = ss, guide = "none")
+ profile_plot <- profile_plot +
+ scale_x_continuous("MS runs", breaks = cumGroupAxis) +
+ scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
+ geom_vline(xintercept = lineNameAxis + 0.5, colour = "grey", linetype = "longdash") +
+ labs(title = unique(input$PROTEIN)) +
+ geom_text(
+ data = groupNametemp, aes(x = RUN, y = ABUNDANCE, label = Name),
+ size = text.size,
+ angle = text.angle,
+ color = "black"
+ ) +
+ theme_msstats("PROFILEPLOT", x.axis.size, y.axis.size, legend.size)
+
+ if (featureName == "TRANSITION") {
+ color_guide <- guide_legend(
+ order = 1,
+ override.aes = list(
+ size = 1.2,
+ linetype = ss
+ ),
+ title = paste("# peptide:", nlevels(input$PEPTIDE)),
+ title.theme = element_text(size = 13, angle = 0),
+ keywidth = 0.25,
+ keyheight = 0.1,
+ default.unit = "inch",
+ ncol = 3
+ )
+ } else if (featureName == "PEPTIDE") {
+ color_guide <- guide_legend(
+ order = 1,
+ title = paste("# peptide:", nlevels(input$PEPTIDE)),
+ title.theme = element_text(size = 13, angle = 0),
+ keywidth = 0.25,
+ keyheight = 0.1,
+ default.unit = "inch",
+ ncol = 3
+ )
+ }
+ shape_guide <- guide_legend(
+ order = 2,
+ title = NULL,
+ label.theme = element_text(size = 11, angle = 0),
+ keywidth = 0.1,
+ keyheight = 0.1,
+ default.unit = "inch"
+ )
+ if (is_censored) {
+ if (featureName == "NA") {
+ profile_plot <- profile_plot + guides(
+ color = FALSE,
+ shape = shape_guide
+ )
} else {
- profile_plot = profile_plot + guides(color = color_guide)
+ profile_plot <- profile_plot + guides(
+ color = color_guide,
+ shape = shape_guide
+ )
}
- profile_plot
+ } else {
+ profile_plot <- profile_plot + guides(color = color_guide)
+ }
+ profile_plot
}
@@ -147,72 +166,101 @@
#' @inheritParams dataProcessPlots
#' @inheritParams .makeProfilePlot
#' @keywords internal
-.makeSummaryProfilePlot = function(
- input, is_censored, y.limdown, y.limup, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size, dot.size.profile, cumGroupAxis,
- yaxis.name, lineNameAxis, groupNametemp
-) {
- RUN = ABUNDANCE = Name = NULL
-
- num_features = data.table::uniqueN(input$FEATURE)
- profile_plot = ggplot(data = input,
- aes_string(x = "RUN", y = "newABUNDANCE",
- color = "analysis", linetype = "FEATURE",
- size = "analysis")) +
- facet_grid(~LABEL) +
- geom_line(size = 0.5)
-
- if (is_censored) { # splitting into two layers to keep red above grey
- profile_plot = profile_plot +
- geom_point(data = input[input$PEPTIDE != "Run summary"],
- aes_string(x = "RUN", y = "newABUNDANCE",
- color = "analysis", size = "analysis",
- shape = "censored")) +
- geom_point(data = input[input$PEPTIDE == "Run summary"],
- aes_string(x = "RUN", y = "newABUNDANCE",
- color = "analysis", size = "analysis",
- shape = "censored")) +
- scale_shape_manual(values = c(16, 1),
- labels = c("Detected data",
- "Censored missing data"))
- } else {
- profile_plot = profile_plot +
- geom_point(size = dot.size.profile) +
- scale_shape_manual(values = c(16))
- }
-
- profile_plot = profile_plot +
- scale_colour_manual(values = c("lightgray", "darkred")) +
- scale_size_manual(values = c(1.7, 2), guide = "none") +
- scale_linetype_manual(values = c(rep(1, times = num_features - 1), 2),
- guide = "none") +
- scale_x_continuous("MS runs", breaks = cumGroupAxis) +
- scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
- geom_vline(xintercept = lineNameAxis + 0.5,
- colour = "grey", linetype = "longdash") +
- labs(title = unique(input$PROTEIN)) +
- geom_text(data = groupNametemp, aes(x = RUN, y = ABUNDANCE, label = Name),
- size = text.size,
- angle = text.angle,
- color = "black") +
- theme_msstats("PROFILEPLOT", x.axis.size, y.axis.size,
- legend.size, legend.title = element_blank())
- color_guide = guide_legend(order = 1,
- title = NULL,
- label.theme = element_text(size = 10, angle = 0))
- shape_guide = guide_legend(order = 2,
- title = NULL,
- label.theme = element_text(size = 10, angle = 0))
- if (is_censored) {
- profile_plot = profile_plot +
- guides(color = color_guide, shape = shape_guide)
- } else {
- profile_plot = profile_plot +
- guides(color = color_guide) +
- geom_point(aes_string(x = "RUN", y = "newABUNDANCE", size = "analysis",
- color = "analysis"), data = input)
- }
- profile_plot
+.makeSummaryProfilePlot <- function(
+ input, is_censored, y.limdown, y.limup, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size, dot.size.profile, cumGroupAxis,
+ yaxis.name, lineNameAxis, groupNametemp) {
+ RUN <- ABUNDANCE <- Name <- NULL
+
+ num_features <- data.table::uniqueN(input$FEATURE)
+ profile_plot <- ggplot(
+ data = input,
+ aes_string(
+ x = "RUN", y = "newABUNDANCE",
+ color = "analysis", linetype = "FEATURE",
+ size = "analysis"
+ )
+ ) +
+ facet_grid(~LABEL) +
+ geom_line(size = 0.5)
+
+ if (is_censored) { # splitting into two layers to keep red above grey
+ profile_plot <- profile_plot +
+ geom_point(
+ data = input[input$PEPTIDE != "Run summary"],
+ aes_string(
+ x = "RUN", y = "newABUNDANCE",
+ color = "analysis", size = "analysis",
+ shape = "censored"
+ )
+ ) +
+ geom_point(
+ data = input[input$PEPTIDE == "Run summary"],
+ aes_string(
+ x = "RUN", y = "newABUNDANCE",
+ color = "analysis", size = "analysis",
+ shape = "censored"
+ )
+ ) +
+ scale_shape_manual(
+ values = c(16, 1),
+ labels = c(
+ "Detected data",
+ "Censored missing data"
+ )
+ )
+ } else {
+ profile_plot <- profile_plot +
+ geom_point(size = dot.size.profile) +
+ scale_shape_manual(values = c(16))
+ }
+
+ profile_plot <- profile_plot +
+ scale_colour_manual(values = c("lightgray", "darkred")) +
+ scale_size_manual(values = c(1.7, 2), guide = "none") +
+ scale_linetype_manual(
+ values = c(rep(1, times = num_features - 1), 2),
+ guide = "none"
+ ) +
+ scale_x_continuous("MS runs", breaks = cumGroupAxis) +
+ scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
+ geom_vline(
+ xintercept = lineNameAxis + 0.5,
+ colour = "grey", linetype = "longdash"
+ ) +
+ labs(title = unique(input$PROTEIN)) +
+ geom_text(
+ data = groupNametemp, aes(x = RUN, y = ABUNDANCE, label = Name),
+ size = text.size,
+ angle = text.angle,
+ color = "black"
+ ) +
+ theme_msstats("PROFILEPLOT", x.axis.size, y.axis.size,
+ legend.size,
+ legend.title = element_blank()
+ )
+ color_guide <- guide_legend(
+ order = 1,
+ title = NULL,
+ label.theme = element_text(size = 10, angle = 0)
+ )
+ shape_guide <- guide_legend(
+ order = 2,
+ title = NULL,
+ label.theme = element_text(size = 10, angle = 0)
+ )
+ if (is_censored) {
+ profile_plot <- profile_plot +
+ guides(color = color_guide, shape = shape_guide)
+ } else {
+ profile_plot <- profile_plot +
+ guides(color = color_guide) +
+ geom_point(aes_string(
+ x = "RUN", y = "newABUNDANCE", size = "analysis",
+ color = "analysis"
+ ), data = input)
+ }
+ profile_plot
}
@@ -221,34 +269,39 @@
#' @param input data.table
#' @param all_proteins character vector of protein names
#' @keywords internal
-.makeQCPlot = function(
- input, all_proteins, y.limdown, y.limup, x.axis.size, y.axis.size,
+.makeQCPlot <- function(
+ input, all_proteins, y.limdown, y.limup, x.axis.size, y.axis.size,
text.size, text.angle, legend.size, label.color, cumGroupAxis, groupName,
- lineNameAxis, yaxis.name
-) {
- RUN = ABUNDANCE = Name = NULL
-
- if (all_proteins) {
- plot_title = "All proteins"
- } else {
- plot_title = unique(input$PROTEIN)
- }
-
- ggplot(input, aes_string(x = "RUN", y = "ABUNDANCE")) +
- facet_grid(~LABEL) +
- geom_boxplot(aes_string(fill = "LABEL"), outlier.shape = 1,
- outlier.size = 1.5) +
- scale_fill_manual(values = label.color, guide = "none") +
- scale_x_discrete("MS runs", breaks = cumGroupAxis) +
- scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
- geom_vline(xintercept = lineNameAxis + 0.5, colour = "grey",
- linetype = "longdash") +
- labs(title = plot_title) +
- geom_text(data = groupName, aes(x = RUN, y = ABUNDANCE, label = Name),
- size = text.size, angle = text.angle, color = "black") +
- theme_msstats("QCPLOT", x.axis.size, y.axis.size,
- legend_size = NULL)
-
+ lineNameAxis, yaxis.name) {
+ RUN <- ABUNDANCE <- Name <- NULL
+
+ if (all_proteins) {
+ plot_title <- "All proteins"
+ } else {
+ plot_title <- unique(input$PROTEIN)
+ }
+
+ ggplot(input, aes_string(x = "RUN", y = "ABUNDANCE")) +
+ facet_grid(~LABEL) +
+ geom_boxplot(aes_string(fill = "LABEL"),
+ outlier.shape = 1,
+ outlier.size = 1.5
+ ) +
+ scale_fill_manual(values = label.color, guide = "none") +
+ scale_x_discrete("MS runs", breaks = cumGroupAxis) +
+ scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
+ geom_vline(
+ xintercept = lineNameAxis + 0.5, colour = "grey",
+ linetype = "longdash"
+ ) +
+ labs(title = plot_title) +
+ geom_text(
+ data = groupName, aes(x = RUN, y = ABUNDANCE, label = Name),
+ size = text.size, angle = text.angle, color = "black"
+ ) +
+ theme_msstats("QCPLOT", x.axis.size, y.axis.size,
+ legend_size = NULL
+ )
}
@@ -257,35 +310,40 @@
#' @param input data.table
#' @param single_protein data.table
#' @keywords internal
-.makeConditionPlot = function(
- input, scale, single_protein, y.limdown, y.limup, x.axis.size, y.axis.size,
- text.size, text.angle, legend.size, dot.size.condition, yaxis.name
-) {
- Mean = ciw = NULL
-
- colnames(input)[colnames(input) == "GROUP"] = "Label"
- if (scale) {
- input$Label = as.numeric(gsub("\\D", "", unique(input$Label)))
- }
-
- plot = ggplot(aes_string(x = "Label", y = "Mean"), data = input) +
- geom_errorbar(aes(ymax = Mean + ciw, ymin = Mean - ciw),
- data = input, width = 0.1, colour = "red") +
- geom_point(size = dot.size.condition, colour = "darkred")
-
- if (!scale) {
- plot = plot + scale_x_discrete("Condition")
- } else {
- plot = plot + scale_x_continuous("Condition", breaks = input$Label,
- labels = input$Label)
- }
-
- plot = plot +
- scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
- geom_hline(yintercept = 0, linetype = "twodash",
- colour = "darkgrey", size = 0.6) +
- labs(title = unique(single_protein$PROTEIN)) +
- theme_msstats("CONDITIONPLOT", x.axis.size, y.axis.size,
- text_angle = text.angle)
- plot
+.makeConditionPlot <- function(
+ input, scale, single_protein, y.limdown, y.limup, x.axis.size, y.axis.size,
+ text.size, text.angle, legend.size, dot.size.condition, yaxis.name) {
+ Mean <- ciw <- NULL
+
+ colnames(input)[colnames(input) == "GROUP"] <- "Label"
+ if (scale) {
+ input$Label <- as.numeric(gsub("\\D", "", unique(input$Label)))
+ }
+
+ plot <- ggplot(aes_string(x = "Label", y = "Mean"), data = input) +
+ geom_errorbar(aes(ymax = Mean + ciw, ymin = Mean - ciw),
+ data = input, width = 0.1, colour = "red"
+ ) +
+ geom_point(size = dot.size.condition, colour = "darkred")
+
+ if (!scale) {
+ plot <- plot + scale_x_discrete("Condition")
+ } else {
+ plot <- plot + scale_x_continuous("Condition",
+ breaks = input$Label,
+ labels = input$Label
+ )
+ }
+
+ plot <- plot +
+ scale_y_continuous(yaxis.name, limits = c(y.limdown, y.limup)) +
+ geom_hline(
+ yintercept = 0, linetype = "twodash",
+ colour = "darkgrey", size = 0.6
+ ) +
+ labs(title = unique(single_protein$PROTEIN)) +
+ theme_msstats("CONDITIONPLOT", x.axis.size, y.axis.size,
+ text_angle = text.angle
+ )
+ plot
}
diff --git a/R/utils_documentation.R b/R/utils_documentation.R
index 2cc03d61..be4807cc 100644
--- a/R/utils_documentation.R
+++ b/R/utils_documentation.R
@@ -1,11 +1,11 @@
#' A dummy function to store shared documentation items.
-#'
+#'
#' @import data.table
-#' @importFrom MSstatsConvert MSstatsImport MSstatsClean MSstatsPreprocess
+#' @importFrom MSstatsConvert MSstatsImport MSstatsClean MSstatsPreprocess
#' MSstatsBalancedDesign MSstatsMakeAnnotation MSstatsLogsSettings
-#'
+#'
#' @param removeFewMeasurements TRUE (default) will remove the features that have 1 or 2 measurements across runs.
-#' @param useUniquePeptide TRUE (default) removes peptides that are assigned for more than one proteins.
+#' @param useUniquePeptide TRUE (default) removes peptides that are assigned to more than one protein.
#' We assume the use of unique peptides for each protein.
#' @param summaryforMultipleRows max(default) or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities.
#' @param removeProtein_with1Feature TRUE will remove the proteins which have only 1 feature, which is the combination of peptide, precursor charge, fragment and charge. FALSE is default.
@@ -18,13 +18,13 @@
#' to an existing log file.
#' @param verbose logical. If TRUE, information about data processing will be printed
#' to the console.
-#' @param log_file_path character. Path to a file to which information about
-#' data processing will be saved.
+#' @param log_file_path character. Path to a file to which information about
+#' data processing will be saved.
#' If not provided, such a file will be created automatically.
#' If `append = TRUE`, has to be a valid path to a file.
-#'
+#'
#' @keywords internal
-#'
-.documentFunction = function() {
- NULL
-}
\ No newline at end of file
+#'
+.documentFunction <- function() {
+ NULL
+}
diff --git a/R/utils_feature_selection.R b/R/utils_feature_selection.R
index b11b6f16..f8b1e4dc 100644
--- a/R/utils_feature_selection.R
+++ b/R/utils_feature_selection.R
@@ -1,53 +1,57 @@
#' Feature selection before feature-level data summarization
-#'
+#'
#' @param input data.table
#' @param method "all" / "highQuality", "topN"
#' @param top_n number of features to use for "topN" method
#' @param min_feature_count number of quality features for "highQuality" method
-#'
+#'
#' @return data.table
-#'
+#'
#' @export
-#'
-#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#'
+#' @examples
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input_all = MSstatsSelectFeatures(input, "all") # all features
-#' input_5 = MSstatsSelectFeatures(data.table::copy(input), "topN", top_n = 5) # top 5 features
-#' input_informative = MSstatsSelectFeatures(input, "highQuality") # feature selection
-#'
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input_all <- MSstatsSelectFeatures(input, "all") # all features
+#' input_5 <- MSstatsSelectFeatures(data.table::copy(input), "topN", top_n = 5) # top 5 features
+#' input_informative <- MSstatsSelectFeatures(input, "highQuality") # feature selection
+#'
#' head(input_all)
#' head(input_5)
#' head(input_informative)
-#'
-MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2) {
- checkmate::assertChoice(method, c("all", "top3", "topN", "highQuality"))
- if (method == "all") {
- msg = "** Use all features that the dataset originally has."
- } else if (method == "highQuality") {
- msg = "** Flag uninformative feature and outliers by feature selection algorithm."
- features_quality = .selectHighQualityFeatures(input, min_feature_count)
- input = merge(input, features_quality, all.x = TRUE,
- by.x = c("LABEL", "PROTEIN", "FEATURE", "originalRUN"),
- by.y = c("label", "protein", "feature", "run"))
- input$feature_quality = ifelse(is.na(input$feature_quality),
- "Informative", input$feature_quality)
- input$is_outlier = ifelse(is.na(input$is_outlier),
- FALSE, input$is_outlier)
- } else if (method %in% c("top3", "topN")) {
- msg = paste0("** Use top", top_n, " features that have highest average of log2(intensity) across runs.")
- input = .selectTopFeatures(input, top_n)
- }
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- input
+#'
+MSstatsSelectFeatures <- function(input, method, top_n = 3, min_feature_count = 2) {
+ checkmate::assertChoice(method, c("all", "top3", "topN", "highQuality"))
+ if (method == "all") {
+ msg <- "** Use all features that the dataset originally has."
+ } else if (method == "highQuality") {
+ msg <- "** Flag uninformative feature and outliers by feature selection algorithm."
+ features_quality <- .selectHighQualityFeatures(input, min_feature_count)
+ input <- merge(input, features_quality,
+ all.x = TRUE,
+ by.x = c("LABEL", "PROTEIN", "FEATURE", "originalRUN"),
+ by.y = c("label", "protein", "feature", "run")
+ )
+ input$feature_quality <- ifelse(is.na(input$feature_quality),
+ "Informative", input$feature_quality
+ )
+ input$is_outlier <- ifelse(is.na(input$is_outlier),
+ FALSE, input$is_outlier
+ )
+ } else if (method %in% c("top3", "topN")) {
+ msg <- paste0("** Use top", top_n, " features that have highest average of log2(intensity) across runs.")
+ input <- .selectTopFeatures(input, top_n)
+ }
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ input
}
@@ -56,16 +60,17 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param top_n number of top features to select
#' @return data.table
#' @keywords internal
-.selectTopFeatures = function(input, top_n) {
- ABUNDANCE = MeanAbundance = remove = FEATURE = feature_rank = NULL
-
- mean_by_feature = input[ABUNDANCE > 0,
- list(MeanAbundance = mean(ABUNDANCE, na.rm = TRUE)),
- by = c("PROTEIN", "FEATURE")]
- mean_by_feature[, feature_rank := rank(-MeanAbundance), by = "PROTEIN"]
- mean_by_feature = mean_by_feature[feature_rank <= top_n, ]
- input[, remove := !(FEATURE %in% mean_by_feature$FEATURE)]
- input
+.selectTopFeatures <- function(input, top_n) {
+ ABUNDANCE <- MeanAbundance <- remove <- FEATURE <- feature_rank <- NULL
+
+ mean_by_feature <- input[ABUNDANCE > 0,
+ list(MeanAbundance = mean(ABUNDANCE, na.rm = TRUE)),
+ by = c("PROTEIN", "FEATURE")
+ ]
+ mean_by_feature[, feature_rank := rank(-MeanAbundance), by = "PROTEIN"]
+ mean_by_feature <- mean_by_feature[feature_rank <= top_n, ]
+ input[, remove := !(FEATURE %in% mean_by_feature$FEATURE)]
+ input
}
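
The ranking above keeps, per protein, the top_n features with the highest mean log2 intensity and flags the remaining ones via the remove column. A minimal sketch with one hypothetical protein, three features, and top_n = 2:

library(data.table)

toy <- data.table(
    PROTEIN = "P1",
    FEATURE = rep(c("F1", "F2", "F3"), each = 2),
    ABUNDANCE = c(20, 21, 15, 16, 25, 26)  # made-up log2 intensities
)
top_n <- 2

means <- toy[ABUNDANCE > 0,
             list(MeanAbundance = mean(ABUNDANCE, na.rm = TRUE)),
             by = c("PROTEIN", "FEATURE")]
means[, feature_rank := rank(-MeanAbundance), by = "PROTEIN"]
keep <- means[feature_rank <= top_n, FEATURE]  # "F1" and "F3"
toy[, remove := !(FEATURE %in% keep)]          # TRUE only for the F2 rows
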
#' Select features of high quality
@@ -73,33 +78,39 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param min_feature_count minimum number of quality features to consider
#' @return data.table
#' @keywords internal
-.selectHighQualityFeatures = function(input, min_feature_count) {
- PROTEIN = PEPTIDE = FEATURE = originalRUN = ABUNDANCE = is_censored = NULL
- is_obs = log2inty = LABEL = NULL
-
- cols = c("PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL",
- "ABUNDANCE", "censored")
- cols = intersect(cols, colnames(input))
- input = input[, cols, with = FALSE]
- if (!("censored" %in% cols)) {
- input$censored = FALSE
- }
- data.table::setnames(input, "censored", "is_censored")
- input = input[, list(protein = as.character(PROTEIN),
- peptide = as.character(PEPTIDE),
- feature = as.character(FEATURE),
- run = as.character(originalRUN),
- label = as.character(LABEL),
- log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
- ABUNDANCE, NA),
- is_censored)]
- input[, is_obs := !(is.na(log2inty) | is_censored)]
- input[, is_censored := NULL]
-
- features_quality = data.table::rbindlist(lapply(split(input, input$label),
- .flagUninformativeSingleLabel,
- min_feature_count = min_feature_count))
- features_quality
+.selectHighQualityFeatures <- function(input, min_feature_count) {
+ PROTEIN <- PEPTIDE <- FEATURE <- originalRUN <- ABUNDANCE <- is_censored <- NULL
+ is_obs <- log2inty <- LABEL <- NULL
+
+ cols <- c(
+ "PROTEIN", "PEPTIDE", "FEATURE", "originalRUN", "LABEL",
+ "ABUNDANCE", "censored"
+ )
+ cols <- intersect(cols, colnames(input))
+ input <- input[, cols, with = FALSE]
+ if (!("censored" %in% cols)) {
+ input$censored <- FALSE
+ }
+ data.table::setnames(input, "censored", "is_censored")
+ input <- input[, list(
+ protein = as.character(PROTEIN),
+ peptide = as.character(PEPTIDE),
+ feature = as.character(FEATURE),
+ run = as.character(originalRUN),
+ label = as.character(LABEL),
+ log2inty = ifelse(!(is.na(ABUNDANCE) | is_censored),
+ ABUNDANCE, NA
+ ),
+ is_censored
+ )]
+ input[, is_obs := !(is.na(log2inty) | is_censored)]
+ input[, is_censored := NULL]
+
+ features_quality <- data.table::rbindlist(lapply(split(input, input$label),
+ .flagUninformativeSingleLabel,
+ min_feature_count = min_feature_count
+ ))
+ features_quality
}
@@ -107,35 +118,36 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @inheritParams .selectHighQualityFeatures
#' @return data.table
#' @keywords internal
-.flagUninformativeSingleLabel = function(input, min_feature_count = 2) {
- log2inty = is_obs = unrep = n_observed = NULL
- label = protein = feature = run = feature_quality = is_outlier = NULL
-
- if (nrow(input) == 0) {
- return(NULL)
- }
- if (unique(input$label) == "H") {
- input = input[log2inty > 0, ]
- }
-
- input[, n_observed := sum(is_obs), by = c("protein", "feature")]
- input[, unrep := n_observed <= 1, ]
- .addOutlierCutoff(input)
- .addNInformativeInfo(input, min_feature_count, "unrep")
- .addCoverageInfo(input)
- .addNInformativeInfo(input, min_feature_count, "is_lowcvr")
- .addModelInformation(input)
- input = .addModelVariances(input)
- input = .addNoisyFlag(input)
-
- input$feature_quality = ifelse(
- !input$unrep & !input$is_lowcvr & !input$is_noisy,
- "Informative", "Uninformative"
- )
- input$is_outlier = ifelse(input$label == "H" & input$log2inty <= 0,
- TRUE, input$is_outlier)
- input = unique(input[, list(label, protein, feature, run, feature_quality, is_outlier)])
- input
+.flagUninformativeSingleLabel <- function(input, min_feature_count = 2) {
+ log2inty <- is_obs <- unrep <- n_observed <- NULL
+ label <- protein <- feature <- run <- feature_quality <- is_outlier <- NULL
+
+ if (nrow(input) == 0) {
+ return(NULL)
+ }
+ if (unique(input$label) == "H") {
+ input <- input[log2inty > 0, ]
+ }
+
+ input[, n_observed := sum(is_obs), by = c("protein", "feature")]
+ input[, unrep := n_observed <= 1, ]
+ .addOutlierCutoff(input)
+ .addNInformativeInfo(input, min_feature_count, "unrep")
+ .addCoverageInfo(input)
+ .addNInformativeInfo(input, min_feature_count, "is_lowcvr")
+ .addModelInformation(input)
+ input <- .addModelVariances(input)
+ input <- .addNoisyFlag(input)
+
+ input$feature_quality <- ifelse(
+ !input$unrep & !input$is_lowcvr & !input$is_noisy,
+ "Informative", "Uninformative"
+ )
+ input$is_outlier <- ifelse(input$label == "H" & input$log2inty <= 0,
+ TRUE, input$is_outlier
+ )
+ input <- unique(input[, list(label, protein, feature, run, feature_quality, is_outlier)])
+ input
}
#' Add outlier cutoff
@@ -143,13 +155,14 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param quantile_order quantile used to label outliers
#' @return data.table
#' @keywords internal
-.addOutlierCutoff = function(input, quantile_order = 0.01) {
- min_obs = NULL
-
- input[,
- min_obs := .calculateOutlierCutoff(.SD, quantile_order),
- by = "protein",
- .SDcols = c("run", "feature", "is_obs", "unrep")]
+.addOutlierCutoff <- function(input, quantile_order = 0.01) {
+ min_obs <- NULL
+
+ input[,
+ min_obs := .calculateOutlierCutoff(.SD, quantile_order),
+ by = "protein",
+ .SDcols = c("run", "feature", "is_obs", "unrep")
+ ]
}
@@ -158,14 +171,16 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @return numeric
#' @importFrom stats qbinom
#' @keywords internal
-.calculateOutlierCutoff = function(input, quantile_order = 0.01) {
- unrep = is_obs = feature = run = NULL
-
- n_runs = data.table::uniqueN(input[!(unrep) & (is_obs), run])
- n_features = data.table::uniqueN(input[!(unrep) & (is_obs), feature])
- n = input[!(unrep) & (is_obs), sum(is_obs, na.rm = TRUE)]
- qbinom(quantile_order, n_runs,
- n / (n_features * n_runs))
+.calculateOutlierCutoff <- function(input, quantile_order = 0.01) {
+ unrep <- is_obs <- feature <- run <- NULL
+
+ n_runs <- data.table::uniqueN(input[!(unrep) & (is_obs), run])
+ n_features <- data.table::uniqueN(input[!(unrep) & (is_obs), feature])
+ n <- input[!(unrep) & (is_obs), sum(is_obs, na.rm = TRUE)]
+ qbinom(
+ quantile_order, n_runs,
+ n / (n_features * n_runs)
+ )
}
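
The cutoff above is the 1% quantile of a binomial model for how many runs a feature is expected to be observed in, given the protein-wide observation rate. With hypothetical counts (10 runs, 5 features, 40 observed values):

quantile_order <- 0.01
n_runs     <- 10
n_features <- 5
n_observed <- 40                              # total non-missing values

p_obs <- n_observed / (n_features * n_runs)   # per-cell observation rate: 0.8
min_obs <- qbinom(quantile_order, n_runs, p_obs)
min_obs  # 5: a feature observed in fewer runs than this is treated as low coverage
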
@@ -173,14 +188,15 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param input data.table
#' @return data.table
#' @keywords internal
-.addCoverageInfo = function(input) {
- is_lowcvr = unrep = has_three_informative = NULL
-
- input[(has_three_informative), is_lowcvr := .flagLowCoverage(.SD),
- by = c("protein", "feature"),
- .SDcols = c("is_obs", "min_obs")]
- input[, is_lowcvr := ifelse(unrep, TRUE, is_lowcvr)]
- input[, is_lowcvr := ifelse(is.na(is_lowcvr), FALSE, is_lowcvr)]
+.addCoverageInfo <- function(input) {
+ is_lowcvr <- unrep <- has_three_informative <- NULL
+
+ input[(has_three_informative), is_lowcvr := .flagLowCoverage(.SD),
+ by = c("protein", "feature"),
+ .SDcols = c("is_obs", "min_obs")
+ ]
+ input[, is_lowcvr := ifelse(unrep, TRUE, is_lowcvr)]
+ input[, is_lowcvr := ifelse(is.na(is_lowcvr), FALSE, is_lowcvr)]
}
@@ -188,8 +204,8 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param input data.table
#' @return logical
#' @keywords internal
-.flagLowCoverage = function(input) {
- sum(input$is_obs, na.rm = TRUE) < unique(input$min_obs)
+.flagLowCoverage <- function(input) {
+ sum(input$is_obs, na.rm = TRUE) < unique(input$min_obs)
}
@@ -198,16 +214,18 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param column name of a column used for filtering
#' @return data.table
#' @keywords internal
-.addNInformativeInfo = function(input, min_feature_count, column) {
- has_three_informative = n_informative = NULL
-
- input[, n_informative := .countInformative(.SD, column), by = "protein",
- .SDcols = c("feature", column)]
- if (is.element("has_three_informative", colnames(input))) {
- input[, has_three_informative := has_three_informative & n_informative > min_feature_count]
- } else {
- input[, has_three_informative := n_informative > min_feature_count]
- }
+.addNInformativeInfo <- function(input, min_feature_count, column) {
+ has_three_informative <- n_informative <- NULL
+
+ input[, n_informative := .countInformative(.SD, column),
+ by = "protein",
+ .SDcols = c("feature", column)
+ ]
+ if (is.element("has_three_informative", colnames(input))) {
+ input[, has_three_informative := has_three_informative & n_informative > min_feature_count]
+ } else {
+ input[, has_three_informative := n_informative > min_feature_count]
+ }
}
#' Count informative features
@@ -216,9 +234,9 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @return numeric
#' @keywords internal
#' @importFrom data.table uniqueN
-.countInformative = function(input, column) {
- feature = NULL
- data.table::uniqueN(input[!input[[column]], feature])
+.countInformative <- function(input, column) {
+ feature <- NULL
+ data.table::uniqueN(input[!input[[column]], feature])
}
@@ -226,15 +244,17 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param input data.table
#' @return data.table
#' @keywords internal
-.addModelInformation = function(input) {
- has_three_informative = NULL
-
- input[(has_three_informative),
- c("model_residuals", "df_resid", "var_resid") := .calculateProteinVariance(.SD),
- by = "protein",
- .SDcols = c("protein", "log2inty", "run",
- "feature", "is_lowcvr", "unrep")]
-
+.addModelInformation <- function(input) {
+ has_three_informative <- NULL
+
+ input[(has_three_informative),
+ c("model_residuals", "df_resid", "var_resid") := .calculateProteinVariance(.SD),
+ by = "protein",
+ .SDcols = c(
+ "protein", "log2inty", "run",
+ "feature", "is_lowcvr", "unrep"
+ )
+ ]
}
@@ -243,23 +263,25 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @return list of residuals, degrees of freedom and variances
#' @importFrom stats residuals
#' @keywords internal
-.calculateProteinVariance = function(input) {
- is_lowcvr = unrep = NULL
-
- robust_model = try(.fitHuber(input[!(is_lowcvr) & !(unrep), ]), silent = TRUE)
- if (inherits(robust_model, "try-error")) {
- list(NA_real_, NA_real_, NA_real_)
+.calculateProteinVariance <- function(input) {
+ is_lowcvr <- unrep <- NULL
+
+ robust_model <- try(.fitHuber(input[!(is_lowcvr) & !(unrep), ]), silent = TRUE)
+ if (inherits(robust_model, "try-error")) {
+ list(NA_real_, NA_real_, NA_real_)
+ } else {
+ if (robust_model$converged) {
+ model_residuals <- rep(NA_real_, nrow(input))
+ model_residuals[!input$is_lowcvr & !is.na(input$log2inty) & !input$unrep] <- residuals(robust_model)
+ list(
+ as.numeric(model_residuals),
+ rep(summary(robust_model)$df[2], nrow(input)),
+ rep(summary(robust_model)$sigma^2, nrow(input))
+ )
} else {
- if (robust_model$converged) {
- model_residuals = rep(NA_real_, nrow(input))
- model_residuals[!input$is_lowcvr & !is.na(input$log2inty) & !input$unrep] = residuals(robust_model)
- list(as.numeric(model_residuals),
- rep(summary(robust_model)$df[2], nrow(input)),
- rep(summary(robust_model)$sigma ^ 2, nrow(input)))
- } else {
- list(NA_real_, NA_real_, NA_real_)
- }
+ list(NA_real_, NA_real_, NA_real_)
}
+ }
}
@@ -267,8 +289,8 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @importFrom MASS rlm
#' @return rlm
#' @keywords internal
-.fitHuber = function(input) {
- MASS::rlm(log2inty ~ run + feature, data = input, scale.est = "Huber")
+.fitHuber <- function(input) {
+ MASS::rlm(log2inty ~ run + feature, data = input, scale.est = "Huber")
}
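
For context, the same two-way (run + feature) model fitted by Huber M-estimation on a small made-up table; the gross outlier is down-weighted instead of inflating the residual scale:

set.seed(1)
toy <- expand.grid(run = paste0("R", 1:4), feature = paste0("F", 1:3))
toy$log2inty <- 20 + as.integer(toy$feature) + rnorm(nrow(toy), sd = 0.1)
toy$log2inty[1] <- 10  # one gross outlier

fit <- MASS::rlm(log2inty ~ run + feature, data = toy, scale.est = "Huber")
summary(fit)$sigma     # robust residual scale
round(fit$w, 2)        # the outlying observation receives a small IWLS weight
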
@@ -277,23 +299,24 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @keywords internal
#' @return data.table
#' @importFrom limma squeezeVar
-.addModelVariances = function(input) {
- protein = df_resid = var_resid = s_resid_eb = NULL
-
- model_variances = unique(input[, list(protein, df_resid, var_resid)])
- model_variances = model_variances[!is.na(df_resid), ]
-
- if (nrow(model_variances) > 0) {
- eb_fit = limma::squeezeVar(model_variances$var_resid, model_variances$df_resid,
- robust = TRUE)
- model_variances$var_resid_eb = eb_fit$var.post
- model_variances$s_resid_eb = sqrt(eb_fit$var.post)
- model_variances = model_variances[, list(protein, s_resid_eb)]
- input = merge(input, model_variances, by = "protein", all.x = TRUE)
- } else {
- input$s_resid_eb = NA_real_
- }
- input
+.addModelVariances <- function(input) {
+ protein <- df_resid <- var_resid <- s_resid_eb <- NULL
+
+ model_variances <- unique(input[, list(protein, df_resid, var_resid)])
+ model_variances <- model_variances[!is.na(df_resid), ]
+
+ if (nrow(model_variances) > 0) {
+ eb_fit <- limma::squeezeVar(model_variances$var_resid, model_variances$df_resid,
+ robust = TRUE
+ )
+ model_variances$var_resid_eb <- eb_fit$var.post
+ model_variances$s_resid_eb <- sqrt(eb_fit$var.post)
+ model_variances <- model_variances[, list(protein, s_resid_eb)]
+ input <- merge(input, model_variances, by = "protein", all.x = TRUE)
+ } else {
+ input$s_resid_eb <- NA_real_
+ }
+ input
}
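
The empirical Bayes step above shrinks the per-protein residual variances toward a common prior with limma::squeezeVar (MSstats calls it with robust = TRUE). A minimal sketch on made-up variances, using the default settings to keep it small:

library(limma)

set.seed(2)
var_resid <- rchisq(10, df = 4) / 4  # hypothetical per-protein residual variances
df_resid  <- rep(4, 10)              # their residual degrees of freedom

eb <- limma::squeezeVar(var_resid, df_resid)
cbind(raw = var_resid, moderated = eb$var.post)  # extreme variances are pulled in
eb$df.prior                                      # estimated prior degrees of freedom
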
@@ -301,38 +324,42 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param input data.table
#' @return data.table
#' @keywords internal
-.addNoisyFlag = function(input) {
- svar_feature = is_outlier = is_noisy = NULL
-
- feature_vars = .getFeatureVariances(input)
- if (nrow(feature_vars) > 0) {
- input = merge(input, feature_vars,
- by = c("protein", "feature"),
- all.x = TRUE)
- if (unique(input$label) == "H") {
- input[, is_outlier := FALSE]
- } else {
- input[, is_outlier := .addOutlierInformation(.SD), by = "feature"]
- }
- input[, is_noisy := svar_feature > .getQuantileCutoff(.SD),
- .SDcols = c("feature", "svar_ref")]
- input[, is_noisy := ifelse(is.na(is_noisy), FALSE, is_noisy)]
- input
+.addNoisyFlag <- function(input) {
+ svar_feature <- is_outlier <- is_noisy <- NULL
+
+ feature_vars <- .getFeatureVariances(input)
+ if (nrow(feature_vars) > 0) {
+ input <- merge(input, feature_vars,
+ by = c("protein", "feature"),
+ all.x = TRUE
+ )
+ if (unique(input$label) == "H") {
+ input[, is_outlier := FALSE]
} else {
- input[, is_outlier := NA]
- input[, is_noisy := NA]
- input
+ input[, is_outlier := .addOutlierInformation(.SD), by = "feature"]
}
+ input[, is_noisy := svar_feature > .getQuantileCutoff(.SD),
+ .SDcols = c("feature", "svar_ref")
+ ]
+ input[, is_noisy := ifelse(is.na(is_noisy), FALSE, is_noisy)]
+ input
+ } else {
+ input[, is_outlier := NA]
+ input[, is_noisy := NA]
+ input
+ }
}
#' @importFrom stats quantile
#' @keywords internal
-.getQuantileCutoff = function(input) {
- feature = svar_ref = NULL
-
- quantile(unique(input[, list(feature, svar_ref)])[, svar_ref],
- 0.05, na.rm = TRUE)
+.getQuantileCutoff <- function(input) {
+ feature <- svar_ref <- NULL
+
+ quantile(unique(input[, list(feature, svar_ref)])[, svar_ref],
+ 0.05,
+ na.rm = TRUE
+ )
}
@@ -341,25 +368,27 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param tolerance cutoff for outliers
#' @return numeric
#' @keywords internal
-.getFeatureVariances = function(input, tolerance = 3) {
- s_resid_eb = num_filter = model_residuals = unrep = is_lowcvr = NULL
- log2inty = n_runs = resid_null = NULL
-
- remove_outliers = unique(input$label) == "L"
- if (remove_outliers) {
- input[, num_filter := abs(model_residuals / s_resid_eb) > tolerance]
- } else {
- input[, num_filter := rep(FALSE, .N)]
- }
- input = input[!(num_filter) & !is.na(model_residuals) & !unrep & !is_lowcvr]
- input[, resid_null := log2inty - mean(log2inty, na.rm = TRUE), by = "protein"]
- input[, n_runs := .N, by = "feature"]
-
- sums_of_squares = input[, list(
- svar_feature = sum(model_residuals ^ 2, na.rm = TRUE) / (n_runs - 1) / unique(s_resid_eb ^ 2),
- svar_ref = sum(resid_null ^ 2, na.rm = TRUE) / (n_runs - 1) / unique(s_resid_eb) ^ 2),
- by = c("protein", "feature")]
- unique(sums_of_squares)
+.getFeatureVariances <- function(input, tolerance = 3) {
+ s_resid_eb <- num_filter <- model_residuals <- unrep <- is_lowcvr <- NULL
+ log2inty <- n_runs <- resid_null <- NULL
+
+ remove_outliers <- unique(input$label) == "L"
+ if (remove_outliers) {
+ input[, num_filter := abs(model_residuals / s_resid_eb) > tolerance]
+ } else {
+ input[, num_filter := rep(FALSE, .N)]
+ }
+ input <- input[!(num_filter) & !is.na(model_residuals) & !unrep & !is_lowcvr]
+ input[, resid_null := log2inty - mean(log2inty, na.rm = TRUE), by = "protein"]
+ input[, n_runs := .N, by = "feature"]
+
+ sums_of_squares <- input[, list(
+ svar_feature = sum(model_residuals^2, na.rm = TRUE) / (n_runs - 1) / unique(s_resid_eb^2),
+ svar_ref = sum(resid_null^2, na.rm = TRUE) / (n_runs - 1) / unique(s_resid_eb)^2
+ ),
+ by = c("protein", "feature")
+ ]
+ unique(sums_of_squares)
}
@@ -369,16 +398,16 @@ MSstatsSelectFeatures = function(input, method, top_n = 3, min_feature_count = 2
#' @param keep_run if TRUE, completely missing runs will be kept
#' @return logical
#' @keywords internal
-.addOutlierInformation = function(input, tol = 3, keep_run = FALSE) {
- result = all_missing = NULL
-
- input$result = abs(input$model_residuals / input$s_resid_eb) > tol
- if (keep_run) {
- input[, all_missing := all(result | is.na(result)), by = "run"]
- input[, result := ifelse(all_missing, FALSE, result)]
- }
- if (is.na(unique(input$s_resid_eb))) {
- input$result = rep(FALSE, nrow(input))
- }
- input$result
+.addOutlierInformation <- function(input, tol = 3, keep_run = FALSE) {
+ result <- all_missing <- NULL
+
+ input$result <- abs(input$model_residuals / input$s_resid_eb) > tol
+ if (keep_run) {
+ input[, all_missing := all(result | is.na(result)), by = "run"]
+ input[, result := ifelse(all_missing, FALSE, result)]
+ }
+ if (is.na(unique(input$s_resid_eb))) {
+ input$result <- rep(FALSE, nrow(input))
+ }
+ input$result
}
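
The outlier rule above is a standardized-residual check: an observation is flagged when its residual exceeds tol times the moderated residual standard deviation. With hypothetical values:

model_residuals <- c(0.1, -0.2, 2.5, 0.05)  # made-up residuals
s_resid_eb <- 0.5                           # moderated residual SD
tol <- 3

abs(model_residuals / s_resid_eb) > tol     # FALSE FALSE TRUE FALSE
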
diff --git a/R/utils_groupcomparison.R b/R/utils_groupcomparison.R
index cf008fcd..9182abb6 100644
--- a/R/utils_groupcomparison.R
+++ b/R/utils_groupcomparison.R
@@ -1,75 +1,78 @@
#' Check if data represents repeated measurements design
-#'
+#'
#' @param summarization_output output of the dataProcess function
-#'
+#'
#' @return logical, TRUE if data represent repeated measurements design
-#'
+#'
#' @details This extracts information required by the group comparison workflow
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData1 <- dataProcess(SRMRawData, use_log_file = FALSE)
#' checkRepeatedDesign(QuantData1)
-#'
-checkRepeatedDesign = function(summarization_output) {
- SUBJECT = GROUP = NULL
-
- input = as.data.table(summarization_output$ProteinLevelData)
- subject_by_group = table(input[, list(SUBJECT, GROUP)])
- subject_appearances = apply(subject_by_group, 1, function(x) sum(x >
- 0))
- repeated = any(subject_appearances > 1)
- if (repeated) {
- msg = "Time course design of experiment - okay"
- }
- else {
- msg = "Case control design of experiment - okay"
- }
- getOption("MSstatsLog")("INFO", msg)
- repeated
+#'
+checkRepeatedDesign <- function(summarization_output) {
+ SUBJECT <- GROUP <- NULL
+
+ input <- as.data.table(summarization_output$ProteinLevelData)
+ subject_by_group <- table(input[, list(SUBJECT, GROUP)])
+    subject_appearances <- apply(
+        subject_by_group, 1,
+        function(x) sum(x > 0)
+    )
+ repeated <- any(subject_appearances > 1)
+ if (repeated) {
+ msg <- "Time course design of experiment - okay"
+ } else {
+ msg <- "Case control design of experiment - okay"
+ }
+ getOption("MSstatsLog")("INFO", msg)
+ repeated
}
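
The design check above counts, for each subject, the number of groups it appears in; any subject measured in more than one group indicates a repeated measures (time course) design. A sketch on a hypothetical annotation:

# subjects 1 and 2 are measured under both conditions -> repeated measures
ann <- data.frame(
    SUBJECT = c("1", "2", "1", "2"),
    GROUP   = c("Control", "Control", "Treated", "Treated")
)
subject_by_group <- table(ann$SUBJECT, ann$GROUP)
subject_appearances <- apply(subject_by_group, 1, function(x) sum(x > 0))
any(subject_appearances > 1)  # TRUE: time course design
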
#' Get information about number of measurements for each group
-#'
+#'
#' @param summarization_output output of the dataProcess function
-#'
+#'
#' @return data.table
-#'
+#'
#' @details This function extracts information required to compute percentages
#' of missing and imputed values in group comparison.
-#'
+#'
#' @export
-#'
+#'
#' @examples
#' QuantData <- dataProcess(DDARawData, use_log_file = FALSE)
#' samples_info <- getSamplesInfo(QuantData)
#' samples_info
-#'
-getSamplesInfo = function(summarization_output) {
- RUN = NULL
-
- summarized = as.data.table(summarization_output$ProteinLevelData)
- summarized[, list(NumRuns = data.table::uniqueN(RUN)),
- by = "GROUP"]
+#'
+getSamplesInfo <- function(summarization_output) {
+ RUN <- NULL
+
+ summarized <- as.data.table(summarization_output$ProteinLevelData)
+ summarized[, list(NumRuns = data.table::uniqueN(RUN)),
+ by = "GROUP"
+ ]
}
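
The summary above just counts the distinct runs per condition; these counts later feed the percentages of missing and imputed values. A sketch on a made-up protein-level table:

library(data.table)

protein_level <- data.table(
    GROUP = c("A", "A", "A", "B", "B"),
    RUN   = c("r1", "r2", "r2", "r3", "r4")  # hypothetical run identifiers
)
protein_level[, list(NumRuns = data.table::uniqueN(RUN)), by = "GROUP"]
# GROUP A has 2 runs, GROUP B has 2 runs
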
#' Prepare data for a single protein for group comparison
#' @param single_protein data.table
#' @keywords internal
-.prepareSingleProteinForGC = function(single_protein) {
- ABUNDANCE = GROUP = SUBJECT = RUN = NULL
-
- data.table::setnames(single_protein,
- c("LogIntensities"),
- c("ABUNDANCE"),
- skip_absent = TRUE)
- single_protein = single_protein[!is.na(ABUNDANCE)]
- single_protein[, GROUP := factor(GROUP)]
- single_protein[, SUBJECT := factor(SUBJECT)]
- single_protein[, RUN := factor(RUN)]
- single_protein
+.prepareSingleProteinForGC <- function(single_protein) {
+ ABUNDANCE <- GROUP <- SUBJECT <- RUN <- NULL
+
+ data.table::setnames(single_protein,
+ c("LogIntensities"),
+ c("ABUNDANCE"),
+ skip_absent = TRUE
+ )
+ single_protein <- single_protein[!is.na(ABUNDANCE)]
+ single_protein[, GROUP := factor(GROUP)]
+ single_protein[, SUBJECT := factor(SUBJECT)]
+ single_protein[, RUN := factor(RUN)]
+ single_protein
}
@@ -86,104 +89,118 @@ getSamplesInfo = function(summarization_output) {
#' @param has_imputed if TRUE, missing values have been imputed by dataProcess
#' @importFrom stats resid fitted
#' @keywords internal
-.fitModelSingleProtein = function(input, contrast_matrix, has_tech_replicates,
- is_single_subject, repeated, groups,
- samples_info,
- save_fitted_models, has_imputed) {
- GROUP = SUBJECT = NULL
-
- input[, GROUP := factor(GROUP)]
- input[, SUBJECT := factor(SUBJECT)]
-
- protein = unique(input$Protein)
- n_groups = nlevels(input$GROUP)
- if (n_groups == 1) {
- result = .getEmptyComparison(input, contrast_matrix,
- groups, protein)
- residuals = NA
- fitted_values = NA
- fit = NULL
+.fitModelSingleProtein <- function(input, contrast_matrix, has_tech_replicates,
+ is_single_subject, repeated, groups,
+ samples_info,
+ save_fitted_models, has_imputed) {
+ GROUP <- SUBJECT <- NULL
+
+ input[, GROUP := factor(GROUP)]
+ input[, SUBJECT := factor(SUBJECT)]
+
+ protein <- unique(input$Protein)
+ n_groups <- nlevels(input$GROUP)
+ if (n_groups == 1) {
+ result <- .getEmptyComparison(
+ input, contrast_matrix,
+ groups, protein
+ )
+ residuals <- NA
+ fitted_values <- NA
+ fit <- NULL
+ } else {
+ fitted_model <- .fitModelForGroupComparison(
+ input, repeated,
+ is_single_subject,
+ has_tech_replicates
+ )
+ result <- .getAllComparisons(
+ input, fitted_model, contrast_matrix,
+ groups, protein
+ )
+ result <- .countMissingPercentage(
+ contrast_matrix,
+ input, result, samples_info,
+ has_imputed
+ )
+
+ if (inherits(fitted_model[["full_fit"]], "lm")) {
+ residuals <- fitted_model[["full_fit"]][["residuals"]]
+ fitted_values <- fitted_model[["full_fit"]][["fitted.values"]]
} else {
- fitted_model = .fitModelForGroupComparison(input, repeated,
- is_single_subject,
- has_tech_replicates)
- result = .getAllComparisons(input, fitted_model, contrast_matrix,
- groups, protein)
- result = .countMissingPercentage(contrast_matrix,
- input, result, samples_info,
- has_imputed)
-
- if (inherits(fitted_model[["full_fit"]], "lm")) {
- residuals = fitted_model[["full_fit"]][["residuals"]]
- fitted_values = fitted_model[["full_fit"]][["fitted.values"]]
- } else {
- residuals = resid(fitted_model[["full_fit"]])
- fitted_values = fitted(fitted_model[["full_fit"]])
- }
- fit = fitted_model[["full_fit"]]
- }
-
- if (!save_fitted_models) {
- fit = NULL
+ residuals <- resid(fitted_model[["full_fit"]])
+ fitted_values <- fitted(fitted_model[["full_fit"]])
}
-
- input[, residuals := residuals]
- input[, fitted := fitted_values]
- list(result, fit)
+ fit <- fitted_model[["full_fit"]]
+ }
+
+ if (!save_fitted_models) {
+ fit <- NULL
+ }
+
+ input[, residuals := residuals]
+ input[, fitted := fitted_values]
+ list(result, fit)
}
#' Choose a model type (fixed/mixed effects) and fit it for a single protein
#' @inheritParams .fitModelSingleProtein
#' @keywords internal
-.fitModelForGroupComparison = function(input, repeated, is_single_subject,
- has_tech_replicates) {
- if (!repeated) {
- if (!has_tech_replicates | is_single_subject) {
- full_fit = lm(ABUNDANCE ~ GROUP, data = input)
- df_full = full_fit[["df.residual"]]
- } else {
- full_fit = suppressMessages(try(
- lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input),
- TRUE
- ))
- df_full = suppressMessages(try(
- lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
- TRUE
- ))
- }
+.fitModelForGroupComparison <- function(input, repeated, is_single_subject,
+ has_tech_replicates) {
+ if (!repeated) {
+ if (!has_tech_replicates | is_single_subject) {
+ full_fit <- lm(ABUNDANCE ~ GROUP, data = input)
+ df_full <- full_fit[["df.residual"]]
} else {
- ## time-course
- if (is_single_subject) {
- full_fit = lm(ABUNDANCE ~ GROUP,
- data = input)
- df_full = full_fit$df.residual
- } else {
- ## no single subject
- if (!has_tech_replicates) {
- full_fit = suppressMessages(try(
- lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT), data = input),
- TRUE
- ))
- df_full = suppressMessages(try(
- lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
- TRUE))
- } else {
- full_fit = suppressMessages(try(
- lme4::lmer(ABUNDANCE ~ GROUP + (1|SUBJECT) + (1|GROUP:SUBJECT),
- data = input),
- TRUE
- ))
- df_full = suppressMessages(try(
- lm(ABUNDANCE ~ GROUP + SUBJECT + GROUP:SUBJECT,
- data = input)$df.residual,
- TRUE
- ))
- }
- }
+ full_fit <- suppressMessages(try(
+ lme4::lmer(ABUNDANCE ~ GROUP + (1 | SUBJECT), data = input),
+ TRUE
+ ))
+ df_full <- suppressMessages(try(
+ lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
+ TRUE
+ ))
}
- list(full_fit = full_fit,
- df_full = df_full)
+ } else {
+ ## time-course
+ if (is_single_subject) {
+ full_fit <- lm(ABUNDANCE ~ GROUP,
+ data = input
+ )
+ df_full <- full_fit$df.residual
+ } else {
+ ## no single subject
+ if (!has_tech_replicates) {
+ full_fit <- suppressMessages(try(
+ lme4::lmer(ABUNDANCE ~ GROUP + (1 | SUBJECT), data = input),
+ TRUE
+ ))
+ df_full <- suppressMessages(try(
+ lm(ABUNDANCE ~ GROUP + SUBJECT, data = input)$df.residual,
+ TRUE
+ ))
+ } else {
+ full_fit <- suppressMessages(try(
+ lme4::lmer(ABUNDANCE ~ GROUP + (1 | SUBJECT) + (1 | GROUP:SUBJECT),
+ data = input
+ ),
+ TRUE
+ ))
+ df_full <- suppressMessages(try(
+ lm(ABUNDANCE ~ GROUP + SUBJECT + GROUP:SUBJECT,
+ data = input
+ )$df.residual,
+ TRUE
+ ))
+ }
+ }
+ }
+ list(
+ full_fit = full_fit,
+ df_full = df_full
+ )
}
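
For a case-control design with technical replicates, the branch above fits a random subject intercept with lme4 and takes the residual degrees of freedom from the corresponding fixed-effects lm. A sketch on simulated data (made-up abundances, two groups, three subjects each, duplicate runs; a true group difference of 1 is injected):

library(lme4)

set.seed(3)
toy <- expand.grid(SUBJECT = paste0("S", 1:6), replicate = 1:2)
toy$GROUP <- ifelse(toy$SUBJECT %in% paste0("S", 1:3), "Control", "Treated")
toy$ABUNDANCE <- 20 + (toy$GROUP == "Treated") +
    rnorm(6, sd = 0.3)[as.integer(factor(toy$SUBJECT))] +  # subject-level effect
    rnorm(nrow(toy), sd = 0.1)                             # technical noise

full_fit <- lme4::lmer(ABUNDANCE ~ GROUP + (1 | SUBJECT), data = toy)
df_full  <- lm(ABUNDANCE ~ GROUP + SUBJECT, data = toy)$df.residual

lme4::fixef(full_fit)["GROUPTreated"]  # estimated group difference (simulated as 1)
df_full                                # residual df taken from the fixed-effects fit
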
@@ -192,16 +209,16 @@ getSamplesInfo = function(summarization_output) {
#' @importFrom stats vcov
#' @importFrom methods is
#' @keywords internal
-.getModelParameters = function(fitted_model) {
- if (is(fitted_model[["full_fit"]], "lm")) {
- model_summary = summary(fitted_model[["full_fit"]])
- cf = model_summary[["coefficients"]]
- vcv = model_summary[["cov.unscaled"]] * (model_summary[["sigma"]] ^ 2)
- } else {
- cf = as.matrix(lme4::fixef(fitted_model[["full_fit"]]))
- vcv = as.matrix(vcov(fitted_model[["full_fit"]]))
- }
- list(cf = cf, vcv = vcv, df = fitted_model[["df_full"]])
+.getModelParameters <- function(fitted_model) {
+ if (is(fitted_model[["full_fit"]], "lm")) {
+ model_summary <- summary(fitted_model[["full_fit"]])
+ cf <- model_summary[["coefficients"]]
+ vcv <- model_summary[["cov.unscaled"]] * (model_summary[["sigma"]]^2)
+ } else {
+ cf <- as.matrix(lme4::fixef(fitted_model[["full_fit"]]))
+ vcv <- as.matrix(vcov(fitted_model[["full_fit"]]))
+ }
+ list(cf = cf, vcv = vcv, df = fitted_model[["df_full"]])
}
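
For a plain lm fit, the covariance matrix reconstructed above (cov.unscaled times sigma squared) is the same matrix that vcov() returns for the fit; for mixed models, lme4::fixef() and vcov() play the corresponding roles. A quick check on a toy linear model with made-up data:

set.seed(4)
toy <- data.frame(
    GROUP = rep(c("A", "B", "C"), each = 4),
    ABUNDANCE = rnorm(12, mean = rep(c(20, 21, 22), each = 4))
)
fit <- lm(ABUNDANCE ~ GROUP, data = toy)
s <- summary(fit)

cf  <- s$coefficients              # estimates, standard errors, t, p
vcv <- s$cov.unscaled * s$sigma^2
all.equal(vcv, vcov(fit))          # TRUE
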
@@ -211,49 +228,57 @@ getSamplesInfo = function(summarization_output) {
#' @param groups unique labels of experimental conditions
#' @param protein name of a protein
#' @keywords internal
-.getEmptyComparison = function(input, contrast_matrix, groups, protein) {
- all_comparisons = lapply(seq_len(nrow(contrast_matrix)), function(row_id) {
- ith_comparison = contrast_matrix[row_id, , drop = FALSE]
-
- if (any(groups[ith_comparison != 0] %in% unique(input$GROUP))) {
- msg = paste("*** error: results of protein", protein,
- "for comparison", row.names(ith_comparison),
- "are NA because there are measurements",
- "only in a single group")
- getOption("MSstatsLog")("INFO", msg)
-
- if (ith_comparison[ith_comparison != 0 &
- (groups %in% unique(input$GROUP))] > 0) {
- list(
- logFC = Inf,
- issue = "oneConditionMissing"
- )
- } else {
- list(
- logFC = -Inf,
- issue = "oneConditionMissing"
- )
- }
- } else {
- msg = paste("*** error: results of protein", protein,
- "for comparison", row.names(ith_comparison),
- "are NA because there are no measurements",
- "in both conditions.")
- getOption("MSstatsLog")("INFO", msg)
-
- list(logFC = NA,
- issue = "completeMissing")
- }
- })
- empty_result = data.table::rbindlist(all_comparisons, fill = TRUE)
- empty_result = cbind(empty_result,
- data.table::data.table(
- Protein = protein,
- Label = row.names(contrast_matrix),
- SE = NA, Tvalue = NA,
- DF = NA, pvalue = NA
- ))
- empty_result
+.getEmptyComparison <- function(input, contrast_matrix, groups, protein) {
+ all_comparisons <- lapply(seq_len(nrow(contrast_matrix)), function(row_id) {
+ ith_comparison <- contrast_matrix[row_id, , drop = FALSE]
+
+ if (any(groups[ith_comparison != 0] %in% unique(input$GROUP))) {
+ msg <- paste(
+ "*** error: results of protein", protein,
+ "for comparison", row.names(ith_comparison),
+ "are NA because there are measurements",
+ "only in a single group"
+ )
+ getOption("MSstatsLog")("INFO", msg)
+
+ if (ith_comparison[ith_comparison != 0 &
+ (groups %in% unique(input$GROUP))] > 0) {
+ list(
+ logFC = Inf,
+ issue = "oneConditionMissing"
+ )
+ } else {
+ list(
+ logFC = -Inf,
+ issue = "oneConditionMissing"
+ )
+ }
+ } else {
+ msg <- paste(
+ "*** error: results of protein", protein,
+ "for comparison", row.names(ith_comparison),
+ "are NA because there are no measurements",
+ "in both conditions."
+ )
+ getOption("MSstatsLog")("INFO", msg)
+
+ list(
+ logFC = NA,
+ issue = "completeMissing"
+ )
+ }
+ })
+ empty_result <- data.table::rbindlist(all_comparisons, fill = TRUE)
+ empty_result <- cbind(
+ empty_result,
+ data.table::data.table(
+ Protein = protein,
+ Label = row.names(contrast_matrix),
+ SE = NA, Tvalue = NA,
+ DF = NA, pvalue = NA
+ )
+ )
+ empty_result
}
@@ -264,27 +289,31 @@ getSamplesInfo = function(summarization_output) {
#' @param groups unique labels of experimental conditions
#' @param protein name of a protein
#' @keywords internal
-.getAllComparisons = function(input, fitted_model, contrast_matrix,
- groups, protein) {
- empty_conditions = setdiff(groups, unique(input$GROUP))
- parameters = .getModelParameters(fitted_model)
- fit = fitted_model[["full_fit"]]
- coefs = parameters$cf[, 1]
-
- all_comparisons = vector("list", nrow(contrast_matrix))
- for (row_id in seq_len(nrow(contrast_matrix))) {
- ith_contrast = contrast_matrix[row_id, , drop = FALSE]
- if (length(empty_conditions) != 0) {
- result = .handleEmptyConditions(input, fit, ith_contrast,
- groups, parameters, protein,
- empty_conditions, coefs)
- } else {
- result = .handleSingleContrast(input, fit, ith_contrast, groups,
- parameters, protein, coefs)
- }
- all_comparisons[[row_id]] = result
+.getAllComparisons <- function(input, fitted_model, contrast_matrix,
+ groups, protein) {
+ empty_conditions <- setdiff(groups, unique(input$GROUP))
+ parameters <- .getModelParameters(fitted_model)
+ fit <- fitted_model[["full_fit"]]
+ coefs <- parameters$cf[, 1]
+
+ all_comparisons <- vector("list", nrow(contrast_matrix))
+ for (row_id in seq_len(nrow(contrast_matrix))) {
+ ith_contrast <- contrast_matrix[row_id, , drop = FALSE]
+ if (length(empty_conditions) != 0) {
+ result <- .handleEmptyConditions(
+ input, fit, ith_contrast,
+ groups, parameters, protein,
+ empty_conditions, coefs
+ )
+ } else {
+ result <- .handleSingleContrast(
+ input, fit, ith_contrast, groups,
+ parameters, protein, coefs
+ )
}
- data.table::rbindlist(all_comparisons, fill = TRUE)
+ all_comparisons[[row_id]] <- result
+ }
+ data.table::rbindlist(all_comparisons, fill = TRUE)
}
@@ -292,68 +321,78 @@ getSamplesInfo = function(summarization_output) {
#' @param input summarized data
#' @param contrast single row of a contrast matrix
#' @param groups unique labels of experimental conditions
-#' @param parameters parameters extracted from the model
+#' @param parameters parameters extracted from the model
#' @param protein name of a protein
#' @param empty_conditions labels of empty conditions
#' @param coefs coefficient of the fitted model
#' @keywords internal
-.handleEmptyConditions = function(input, fit, contrast,
- groups, parameters, protein,
- empty_conditions, coefs) {
- count_diff_pos = intersect(colnames(contrast)[contrast != 0 & contrast > 0],
- empty_conditions)
- count_diff_neg = intersect(colnames(contrast)[contrast != 0 & contrast < 0],
- empty_conditions)
- flag_issue_pos = length(count_diff_pos) != 0
- flag_issue_neg = length(count_diff_neg) != 0
-
- if (any(c(flag_issue_pos, flag_issue_neg))) {
- if (flag_issue_pos & flag_issue_neg) {
- issue = "completeMissing"
- logFC = NA
- } else if (flag_issue_pos & !flag_issue_neg) {
- issue_side = count_diff_pos
- logFC = -Inf
- issue = "oneConditionMissing"
- } else if (!flag_issue_pos & flag_issue_neg) {
- issue_side = count_diff_neg
- logFC = Inf
- issue = "oneConditionMissing"
- }
- result = list(Protein = protein,
- logFC = logFC,
- Label = row.names(contrast),
- SE = NA, Tvalue = NA, DF = NA, pvalue = NA,
- issue = issue)
- } else {
- result = .handleSingleContrast(input, fit, contrast, groups,
- parameters, protein, coefs)
+.handleEmptyConditions <- function(input, fit, contrast,
+ groups, parameters, protein,
+ empty_conditions, coefs) {
+ count_diff_pos <- intersect(
+ colnames(contrast)[contrast != 0 & contrast > 0],
+ empty_conditions
+ )
+ count_diff_neg <- intersect(
+ colnames(contrast)[contrast != 0 & contrast < 0],
+ empty_conditions
+ )
+ flag_issue_pos <- length(count_diff_pos) != 0
+ flag_issue_neg <- length(count_diff_neg) != 0
+
+ if (any(c(flag_issue_pos, flag_issue_neg))) {
+ if (flag_issue_pos & flag_issue_neg) {
+ issue <- "completeMissing"
+ logFC <- NA
+ } else if (flag_issue_pos & !flag_issue_neg) {
+ issue_side <- count_diff_pos
+ logFC <- -Inf
+ issue <- "oneConditionMissing"
+ } else if (!flag_issue_pos & flag_issue_neg) {
+ issue_side <- count_diff_neg
+ logFC <- Inf
+ issue <- "oneConditionMissing"
}
- result
+ result <- list(
+ Protein = protein,
+ logFC = logFC,
+ Label = row.names(contrast),
+ SE = NA, Tvalue = NA, DF = NA, pvalue = NA,
+ issue = issue
+ )
+ } else {
+ result <- .handleSingleContrast(
+ input, fit, contrast, groups,
+ parameters, protein, coefs
+ )
+ }
+ result
}
#' Group comparison for a single contrast
#' @inheritParams .handleEmptyConditions
#' @keywords internal
-.handleSingleContrast = function(input, fit, contrast, groups,
- parameters, protein, coefs) {
- groups = sort(groups)
- contrast_values = .getContrast(input, contrast, coefs, groups)
- parameters$cf = parameters$cf[names(contrast_values), , drop = FALSE]
- parameters$vcv = parameters$vcv[names(contrast_values), names(contrast_values)]
- result = get_estimable_fixed_random(parameters, contrast_values)
- if (is.null(result)) {
- result = list(Protein = protein,
- Label = row.names(contrast),
- logFC = NA, SE = NA, Tvalue = NA,
- DF = NA, pvalue = NA, issue = NA)
- } else {
- result$Protein = protein
- result$Label = row.names(contrast)
- result$issue = NA
- }
- result
+.handleSingleContrast <- function(input, fit, contrast, groups,
+ parameters, protein, coefs) {
+ groups <- sort(groups)
+ contrast_values <- .getContrast(input, contrast, coefs, groups)
+ parameters$cf <- parameters$cf[names(contrast_values), , drop = FALSE]
+ parameters$vcv <- parameters$vcv[names(contrast_values), names(contrast_values)]
+ result <- get_estimable_fixed_random(parameters, contrast_values)
+ if (is.null(result)) {
+ result <- list(
+ Protein = protein,
+ Label = row.names(contrast),
+ logFC = NA, SE = NA, Tvalue = NA,
+ DF = NA, pvalue = NA, issue = NA
+ )
+ } else {
+ result$Protein <- protein
+ result$Label <- row.names(contrast)
+ result$issue <- NA
+ }
+ result
}
@@ -363,27 +402,27 @@ getSamplesInfo = function(summarization_output) {
#' @param coefs coefficients of a linear model (named vector)
#' @param groups unique group labels
#' @keywords internal
-.getContrast = function(input, contrast, coefs, groups) {
- coef_names = names(coefs)
- intercept = grep("Intercept", coef_names, value = TRUE)
- if (length(intercept) > 0) {
- intercept_term = rep(0, length(intercept))
- names(intercept_term) = intercept
- } else {
- intercept_term = NULL
- }
- group = grep("GROUP", coef_names, value = TRUE)
- interaction = grep(":", coef_names, value = TRUE)
- group = setdiff(group, interaction)
- if (length(group) > 0) {
- group_term = contrast[, as.character(groups[groups %in% unique(input$GROUP)])]
- names(group_term) = paste0("GROUP", names(group_term))
- group_term = group_term[-1]
- } else {
- group_term = NULL
- }
- contrast = c(intercept_term, group_term)
- contrast[!is.na(coefs)]
+.getContrast <- function(input, contrast, coefs, groups) {
+ coef_names <- names(coefs)
+ intercept <- grep("Intercept", coef_names, value = TRUE)
+ if (length(intercept) > 0) {
+ intercept_term <- rep(0, length(intercept))
+ names(intercept_term) <- intercept
+ } else {
+ intercept_term <- NULL
+ }
+ group <- grep("GROUP", coef_names, value = TRUE)
+ interaction <- grep(":", coef_names, value = TRUE)
+ group <- setdiff(group, interaction)
+ if (length(group) > 0) {
+ group_term <- contrast[, as.character(groups[groups %in% unique(input$GROUP)])]
+ names(group_term) <- paste0("GROUP", names(group_term))
+ group_term <- group_term[-1]
+ } else {
+ group_term <- NULL
+ }
+ contrast <- c(intercept_term, group_term)
+ contrast[!is.na(coefs)]
}
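For context, `.getContrast` maps a row of the contrast matrix onto the fitted model's coefficients, with the reference group absorbed by the intercept. A minimal sketch of that mapping, using hypothetical group labels and coefficient values (not MSstats output), assuming all three groups were measured and no coefficient is NA:

```r
# Hypothetical groups and lm()-style coefficients; the reference level C1 is
# absorbed by the intercept, so only the GROUPC2 and GROUPC3 terms remain.
groups <- c("C1", "C2", "C3")
contrast <- matrix(c(-1, 1, 0), nrow = 1,
                   dimnames = list("C2 vs C1", groups))
coefs <- c("(Intercept)" = 5.0, GROUPC2 = 1.2, GROUPC3 = 0.4)
# .getContrast would return a 0 for the intercept plus the non-reference terms:
contrast_values <- c("(Intercept)" = 0, GROUPC2 = 1, GROUPC3 = 0)
unname(sum(coefs * contrast_values))  # estimated log fold change C2 - C1: 1.2
```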
@@ -394,49 +433,58 @@ getSamplesInfo = function(summarization_output) {
#' @param samples_info number of runs per group
#' @param has_imputed if TRUE, missing values have been imputed by dataProcess
#' @keywords internal
-.countMissingPercentage = function(contrast_matrix, summarized,
- result, samples_info, has_imputed) {
- TotalGroupMeasurements = NumMeasuredFeature = NumImputedFeature = NULL
- NumFea = NumRuns = totalN = NULL
-
- counts = summarized[,
- list(totalN = unique(TotalGroupMeasurements),
- NumMeasuredFeature = sum(NumMeasuredFeature,
- na.rm = TRUE),
- NumImputedFeature = sum(NumImputedFeature,
- na.rm = TRUE)),
- by = "GROUP"]
-
- empty_conditions = setdiff(samples_info$GROUP, unique(counts$GROUP))
- if (length(empty_conditions) !=0) {
- counts = merge(samples_info, counts, by = "GROUP", all.x = TRUE)
- counts[, NumFea := totalN / NumRuns ] # calculate number of features, to get the expected number of measurements in missing conditions
- nofea = max(ceiling(counts$NumFea), na.rm = TRUE) # it should be integer, just in case of double, use ceiling to get interger
- counts[is.na(totalN), NumMeasuredFeature := 0]
- counts[is.na(totalN), NumImputedFeature := 0]
- counts[is.na(totalN), totalN := NumRuns * nofea]
- }
-
- missing_vector = numeric(nrow(contrast_matrix))
- imputed_vector = numeric(nrow(contrast_matrix))
- for (i in seq_len(nrow(contrast_matrix))) {
- conditions = contrast_matrix[i, ] != 0
- missing_percentage = 1 - sum(counts$NumMeasuredFeature[conditions],
- na.rm = TRUE) / sum(counts$totalN[conditions],
- na.rm = TRUE)
- if (has_imputed) {
- imputed_percentage = sum(counts$NumImputedFeature[conditions],
- na.rm = TRUE) / sum(counts$totalN[conditions],
- na.rm = TRUE)
- imputed_vector[i] = imputed_percentage
- }
- missing_vector[i] = missing_percentage
- }
- result$MissingPercentage = missing_vector
+.countMissingPercentage <- function(contrast_matrix, summarized,
+ result, samples_info, has_imputed) {
+ TotalGroupMeasurements <- NumMeasuredFeature <- NumImputedFeature <- NULL
+ NumFea <- NumRuns <- totalN <- NULL
+
+ counts <- summarized[,
+ list(
+ totalN = unique(TotalGroupMeasurements),
+ NumMeasuredFeature = sum(NumMeasuredFeature,
+ na.rm = TRUE
+ ),
+ NumImputedFeature = sum(NumImputedFeature,
+ na.rm = TRUE
+ )
+ ),
+ by = "GROUP"
+ ]
+
+ empty_conditions <- setdiff(samples_info$GROUP, unique(counts$GROUP))
+ if (length(empty_conditions) != 0) {
+ counts <- merge(samples_info, counts, by = "GROUP", all.x = TRUE)
+ counts[, NumFea := totalN / NumRuns] # number of features, used to derive the expected number of measurements in empty conditions
+ nofea <- max(ceiling(counts$NumFea), na.rm = TRUE) # should be an integer; use ceiling in case it is stored as a double
+ counts[is.na(totalN), NumMeasuredFeature := 0]
+ counts[is.na(totalN), NumImputedFeature := 0]
+ counts[is.na(totalN), totalN := NumRuns * nofea]
+ }
+
+ missing_vector <- numeric(nrow(contrast_matrix))
+ imputed_vector <- numeric(nrow(contrast_matrix))
+ for (i in seq_len(nrow(contrast_matrix))) {
+ conditions <- contrast_matrix[i, ] != 0
+ missing_percentage <- 1 - sum(counts$NumMeasuredFeature[conditions],
+ na.rm = TRUE
+ ) / sum(counts$totalN[conditions],
+ na.rm = TRUE
+ )
if (has_imputed) {
- result$ImputationPercentage = imputed_vector
+ imputed_percentage <- sum(counts$NumImputedFeature[conditions],
+ na.rm = TRUE
+ ) / sum(counts$totalN[conditions],
+ na.rm = TRUE
+ )
+ imputed_vector[i] <- imputed_percentage
}
- result
+ missing_vector[i] <- missing_percentage
+ }
+ result$MissingPercentage <- missing_vector
+ if (has_imputed) {
+ result$ImputationPercentage <- imputed_vector
+ }
+ result
}
#' Perform group comparison per protein in parallel
@@ -445,37 +493,42 @@ getSamplesInfo = function(summarization_output) {
#' @param save_fitted_models if TRUE, fitted models will be included in the output
#' @param repeated logical, output of checkRepeatedDesign function
#' @param samples_info data.table, output of getSamplesInfo function
-#' @param numberOfCores Number of cores for parallel processing.
-#' A logfile named `MSstats_groupComparison_log_progress.log` is created to
+#' @param numberOfCores Number of cores for parallel processing.
+#' A logfile named `MSstats_groupComparison_log_progress.log` is created to
#' track progress. Only works for Linux & Mac OS.
#' @importFrom parallel makeCluster clusterExport parLapply stopCluster
#' @keywords internal
-.groupComparisonWithMultipleCores = function(summarized_list, contrast_matrix,
- save_fitted_models, repeated, samples_info,
- numberOfCores) {
- groups = colnames(contrast_matrix)
- has_imputed = attr(summarized_list, "has_imputed")
- all_proteins_id = seq_along(summarized_list)
- function_environment = environment()
- cl = parallel::makeCluster(numberOfCores)
- parallel::clusterExport(cl, c("MSstatsGroupComparisonSingleProtein",
- "contrast_matrix", "repeated", "groups",
- "samples_info", "save_fitted_models", "has_imputed"),
- envir = function_environment)
- cat(paste0("Number of proteins to process: ", length(all_proteins_id)),
- sep = "\n", file = "MSstats_groupComparison_log_progress.log")
- test_results = parallel::parLapply(cl, all_proteins_id, function(i) {
- if (i %% 100 == 0) {
- cat("Finished processing an additional 100 protein comparisons",
- sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE)
- }
- MSstatsGroupComparisonSingleProtein(
- summarized_list[[i]], contrast_matrix, repeated,
- groups, samples_info, save_fitted_models, has_imputed
- )
- })
- parallel::stopCluster(cl)
- test_results
+.groupComparisonWithMultipleCores <- function(summarized_list, contrast_matrix,
+ save_fitted_models, repeated, samples_info,
+ numberOfCores) {
+ groups <- colnames(contrast_matrix)
+ has_imputed <- attr(summarized_list, "has_imputed")
+ all_proteins_id <- seq_along(summarized_list)
+ function_environment <- environment()
+ cl <- parallel::makeCluster(numberOfCores)
+ parallel::clusterExport(cl, c(
+ "MSstatsGroupComparisonSingleProtein",
+ "contrast_matrix", "repeated", "groups",
+ "samples_info", "save_fitted_models", "has_imputed"
+ ),
+ envir = function_environment
+ )
+ cat(paste0("Number of proteins to process: ", length(all_proteins_id)),
+ sep = "\n", file = "MSstats_groupComparison_log_progress.log"
+ )
+ test_results <- parallel::parLapply(cl, all_proteins_id, function(i) {
+ if (i %% 100 == 0) {
+ cat("Finished processing an additional 100 protein comparisons",
+ sep = "\n", file = "MSstats_groupComparison_log_progress.log", append = TRUE
+ )
+ }
+ MSstatsGroupComparisonSingleProtein(
+ summarized_list[[i]], contrast_matrix, repeated,
+ groups, samples_info, save_fitted_models, has_imputed
+ )
+ })
+ parallel::stopCluster(cl)
+ test_results
}
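The multi-core path above follows the standard `parallel` cluster workflow: create the cluster, export the objects the worker function needs, run `parLapply`, then stop the cluster. A generic sketch of that pattern with a hypothetical worker function (not MSstats code):

```r
library(parallel)

slow_task <- function(x) { Sys.sleep(0.01); x^2 }  # hypothetical worker

cl <- makeCluster(2)
# Anything referenced inside the parLapply() function must be exported first
clusterExport(cl, "slow_task", envir = environment())
res <- parLapply(cl, 1:20, function(i) slow_task(i))
stopCluster(cl)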
#' Perform group comparison per protein iteratively with a single loop
@@ -486,23 +539,22 @@ getSamplesInfo = function(summarization_output) {
#' @param samples_info data.table, output of getSamplesInfo function
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @keywords internal
-.groupComparisonWithSingleCore = function(summarized_list, contrast_matrix,
- save_fitted_models, repeated,
- samples_info) {
- groups = colnames(contrast_matrix)
- has_imputed = attr(summarized_list, "has_imputed")
- all_proteins_id = seq_along(summarized_list)
- test_results = vector("list", length(all_proteins_id))
- pb = txtProgressBar(max = length(all_proteins_id), style = 3)
- for (i in all_proteins_id) {
- comparison_outputs = MSstatsGroupComparisonSingleProtein(
- summarized_list[[i]], contrast_matrix, repeated,
- groups, samples_info, save_fitted_models, has_imputed
- )
- test_results[[i]] = comparison_outputs
- setTxtProgressBar(pb, i)
- }
- close(pb)
- test_results
+.groupComparisonWithSingleCore <- function(summarized_list, contrast_matrix,
+ save_fitted_models, repeated,
+ samples_info) {
+ groups <- colnames(contrast_matrix)
+ has_imputed <- attr(summarized_list, "has_imputed")
+ all_proteins_id <- seq_along(summarized_list)
+ test_results <- vector("list", length(all_proteins_id))
+ pb <- txtProgressBar(max = length(all_proteins_id), style = 3)
+ for (i in all_proteins_id) {
+ comparison_outputs <- MSstatsGroupComparisonSingleProtein(
+ summarized_list[[i]], contrast_matrix, repeated,
+ groups, samples_info, save_fitted_models, has_imputed
+ )
+ test_results[[i]] <- comparison_outputs
+ setTxtProgressBar(pb, i)
+ }
+ close(pb)
+ test_results
}
-
diff --git a/R/utils_groupcomparison_checks.R b/R/utils_groupcomparison_checks.R
index d972edeb..4dd100c8 100644
--- a/R/utils_groupcomparison_checks.R
+++ b/R/utils_groupcomparison_checks.R
@@ -1,17 +1,21 @@
#' Check if groupComparison input was processed by the dataProcess function
#' @param input data.table
#' @keywords internal
-.checkGroupComparisonInput = function(input) {
- cols = c("RUN", "Protein", "LogIntensities", "originalRUN",
- "GROUP", "SUBJECT", "more50missing", "NumMeasuredFeature")
-
- if (length(setdiff(toupper(cols), toupper(colnames(input)))) > 0) {
- msg = paste("The `data` input was not processed by the dataProcess function.",
- "Please use the dataProcess function first.")
- getOption("MSstatsLog")("INFO", msg)
- stop(msg)
- }
- input
+.checkGroupComparisonInput <- function(input) {
+ cols <- c(
+ "RUN", "Protein", "LogIntensities", "originalRUN",
+ "GROUP", "SUBJECT", "more50missing", "NumMeasuredFeature"
+ )
+
+ if (length(setdiff(toupper(cols), toupper(colnames(input)))) > 0) {
+ msg <- paste(
+ "The `data` input was not processed by the dataProcess function.",
+ "Please use the dataProcess function first."
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ stop(msg)
+ }
+ input
}
@@ -19,48 +23,56 @@
#' @param contrast_matrix contrast matrix
#' @param input data.table of summarized data
#' @keywords internal
-.checkContrastMatrix = function(contrast_matrix, input) {
- if (ncol(contrast_matrix) != nlevels(input$GROUP)) {
- msg = paste("Number of columns of the contrast.matrix parameter must be",
- "equal to the number of groups.")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
-
- if (any(is.null(row.names(contrast_matrix)))) {
- msg = paste("All rows of the contrast.matrix parameter",
- "must be named")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
- contrast_matrix
+.checkContrastMatrix <- function(contrast_matrix, input) {
+ if (ncol(contrast_matrix) != nlevels(input$GROUP)) {
+ msg <- paste(
+ "Number of columns of the contrast.matrix parameter must be",
+ "equal to the number of groups."
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
+
+ if (any(is.null(row.names(contrast_matrix)))) {
+ msg <- paste(
+ "All rows of the contrast.matrix parameter",
+ "must be named"
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
+ contrast_matrix
}
#' Check if there is only single subject
#' @param input data.table
#' @keywords internal
-.checkSingleSubject = function(input) {
- SUBJECT = GROUP = NULL
-
- unique_annot = unique(input[, list(GROUP, SUBJECT)])
- subject_counts = unique_annot[, list(NumSubjects = data.table::uniqueN(SUBJECT)),
- by = "GROUP"]
- all(subject_counts$NumSubject == 1)
+.checkSingleSubject <- function(input) {
+ SUBJECT <- GROUP <- NULL
+
+ unique_annot <- unique(input[, list(GROUP, SUBJECT)])
+ subject_counts <- unique_annot[, list(NumSubjects = data.table::uniqueN(SUBJECT)),
+ by = "GROUP"
+ ]
+ all(subject_counts$NumSubjects == 1)
}
#' Check if there are technical replicates
#' @param input data.table
#' @keywords internal
-.checkTechReplicate = function(input) {
- GROUP = RUN = SUBJECT = NULL
-
- unique_annot = unique(input[, list(RUN,
- SUBJECT_NESTED = paste(GROUP,
- SUBJECT,
- sep = "."))])
- run_counts = unique_annot[, list(NumRuns = data.table::uniqueN(RUN)),
- by = "SUBJECT_NESTED"]
- any(run_counts[["NumRuns"]] > 1)
+.checkTechReplicate <- function(input) {
+ GROUP <- RUN <- SUBJECT <- NULL
+
+ unique_annot <- unique(input[, list(RUN,
+ SUBJECT_NESTED = paste(GROUP,
+ SUBJECT,
+ sep = "."
+ )
+ )])
+ run_counts <- unique_annot[, list(NumRuns = data.table::uniqueN(RUN)),
+ by = "SUBJECT_NESTED"
+ ]
+ any(run_counts[["NumRuns"]] > 1)
}
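A toy illustration of the check above, with a hypothetical annotation table: a subject measured in more than one run within the same group counts as a technical replicate.

```r
library(data.table)
annotation <- data.table(RUN = c("run1", "run2", "run3"),
                         GROUP = c("A", "A", "B"),
                         SUBJECT = c("s1", "s1", "s2"))
annotation[, SUBJECT_NESTED := paste(GROUP, SUBJECT, sep = ".")]
run_counts <- annotation[, list(NumRuns = uniqueN(RUN)), by = "SUBJECT_NESTED"]
any(run_counts$NumRuns > 1)  # TRUE: subject s1 appears in run1 and run2
```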
diff --git a/R/utils_groupcomparison_contrasts.R b/R/utils_groupcomparison_contrasts.R
index d628ed58..d861e2c9 100644
--- a/R/utils_groupcomparison_contrasts.R
+++ b/R/utils_groupcomparison_contrasts.R
@@ -1,95 +1,118 @@
#' @export
-MSstatsContrastMatrix.list = function(contrasts, conditions, labels = NULL) {
- num_conditions = length(conditions)
- contrast_matrix = matrix(0, nrow = length(contrasts),
- ncol = num_conditions)
- for (contrast_id in seq_along(contrasts)) {
- contrast = contrasts[[contrast_id]]
- contrast_vector = rep(0, num_conditions)
- positive = conditions %in% contrast[[1]]
- negative = conditions %in% contrast[[2]]
- contrast_vector[positive] = 1 / sum(positive)
- contrast_vector[negative] = -1 / sum(negative)
- contrast_matrix[contrast_id, ] = contrast_vector
- }
- if (is.null(labels)) {
- row.names(contrast_matrix) = .getContrastLabels(contrasts)
- } else {
- row.names(contrast_matrix) = labels
- }
- colnames(contrast_matrix) = conditions
- contrast_matrix
+MSstatsContrastMatrix.list <- function(contrasts, conditions, labels = NULL) {
+ num_conditions <- length(conditions)
+ contrast_matrix <- matrix(0,
+ nrow = length(contrasts),
+ ncol = num_conditions
+ )
+ for (contrast_id in seq_along(contrasts)) {
+ contrast <- contrasts[[contrast_id]]
+ contrast_vector <- rep(0, num_conditions)
+ positive <- conditions %in% contrast[[1]]
+ negative <- conditions %in% contrast[[2]]
+ contrast_vector[positive] <- 1 / sum(positive)
+ contrast_vector[negative] <- -1 / sum(negative)
+ contrast_matrix[contrast_id, ] <- contrast_vector
+ }
+ if (is.null(labels)) {
+ row.names(contrast_matrix) <- .getContrastLabels(contrasts)
+ } else {
+ row.names(contrast_matrix) <- labels
+ }
+ colnames(contrast_matrix) <- conditions
+ contrast_matrix
}
#' @importFrom utils combn
#' @export
-MSstatsContrastMatrix.character = function(contrasts, conditions, labels = NULL) {
- if (contrasts == "pairwise") {
- contrast_combinations = combn(conditions, 2)
- num_combinations = ncol(contrast_combinations)
- contrasts_list = lapply(seq_len(num_combinations),
- function(x) list(contrast_combinations[1, x],
- contrast_combinations[2, x]))
- MSstatsContrastMatrix.list(contrasts_list, conditions)
- } else {
- stop(paste("Contrast matrix of type", contrasts, "not implemented"))
- }
+MSstatsContrastMatrix.character <- function(contrasts, conditions, labels = NULL) {
+ if (contrasts == "pairwise") {
+ contrast_combinations <- combn(conditions, 2)
+ num_combinations <- ncol(contrast_combinations)
+ contrasts_list <- lapply(
+ seq_len(num_combinations),
+ function(x) {
+ list(
+ contrast_combinations[1, x],
+ contrast_combinations[2, x]
+ )
+ }
+ )
+ MSstatsContrastMatrix.list(contrasts_list, conditions)
+ } else {
+ stop(paste("Contrast matrix of type", contrasts, "not implemented"))
+ }
}
#' @export
-MSstatsContrastMatrix.matrix = function(contrasts, conditions, labels = NULL) {
- groups_matrix = colnames(contrasts)
- checkmate::assertSetEqual(groups_matrix, as.character(conditions),
- .var.name = "colnames of contrast matrix")
- if (is.null(row.names(contrasts))) {
- stop("Row names of the contrast matrix must be the contrast labels")
- }
- contrasts
+MSstatsContrastMatrix.matrix <- function(contrasts, conditions, labels = NULL) {
+ groups_matrix <- colnames(contrasts)
+ checkmate::assertSetEqual(groups_matrix, as.character(conditions),
+ .var.name = "colnames of contrast matrix"
+ )
+ if (is.null(row.names(contrasts))) {
+ stop("Row names of the contrast matrix must be the contrast labels")
+ }
+ contrasts
}
#' @export
-MSstatsContrastMatrix.data.frame = function(contrasts, conditions, labels) {
- groups_matrix = colnames(contrasts)
- checkmate::assertSetEqual(groups_matrix, as.character(conditions),
- .var.name = "colnames of contrast matrix")
- if (is.null(row.names(contrasts))) {
- stop("Row names of the contrast matrix must be the contrast labels")
- }
- as.matrix(contrasts)
+MSstatsContrastMatrix.data.frame <- function(contrasts, conditions, labels) {
+ groups_matrix <- colnames(contrasts)
+ checkmate::assertSetEqual(groups_matrix, as.character(conditions),
+ .var.name = "colnames of contrast matrix"
+ )
+ if (is.null(row.names(contrasts))) {
+ stop("Row names of the contrast matrix must be the contrast labels")
+ }
+ as.matrix(contrasts)
}
#' Create a contrast matrix for groupComparison function
-#'
+#'
#' @param contrasts One of the following:
-#' i) list of lists. Each sub-list consists of two vectors that name
+#' i) list of lists. Each sub-list consists of two vectors that name
#' conditions that will be compared. See the details section for more information
#' ii) matrix. In this case, it's correctness will be checked
#' iii) "pairwise". In this case, pairwise comparison matrix will be generated
#' iv) data.frame. In this case, input will be converted to matrix
-#' @param conditions unique condition labels
+#' @param conditions unique condition labels
#' @param labels labels for contrasts (row.names of the contrast matrix)
-#'
+#'
#' @export
-#'
-MSstatsContrastMatrix = function(contrasts, conditions, labels = NULL) {
- UseMethod("MSstatsContrastMatrix", contrasts)
+#'
+MSstatsContrastMatrix <- function(contrasts, conditions, labels = NULL) {
+ UseMethod("MSstatsContrastMatrix", contrasts)
}
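A short usage sketch of the input formats documented above, with hypothetical condition labels:

```r
conditions <- c("C1", "C2", "C3")

# i) list of lists: each sub-list names the positive and negative condition(s)
MSstatsContrastMatrix(list(list("C1", "C2"),
                           list("C1", c("C2", "C3"))), conditions)

# iii) "pairwise": every pairwise comparison, generated with combn()
MSstatsContrastMatrix("pairwise", conditions)

# ii) matrix: returned as-is after its column and row names are validated
cm <- matrix(c(1, -1, 0), nrow = 1,
             dimnames = list("C1 vs C2", conditions))
MSstatsContrastMatrix(cm, conditions)
```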
-#' Get labels for contrasts
+#' Get labels for contrasts
#' @param contrasts list of lists of condition labels
#' @keywords internal
-.getContrastLabels = function(contrasts) {
- prelabels = lapply(contrasts,
- function(x) sapply(x,
- function(y)
- paste(y, sep = ",",
- collapse = ",")))
- labels = sapply(prelabels, function(x) paste(x, sep = " vs ",
- collapse = " vs "))
- labels
+.getContrastLabels <- function(contrasts) {
+ prelabels <- lapply(
+ contrasts,
+ function(x) {
+ sapply(
+ x,
+ function(y) {
+ paste(y,
+ sep = ",",
+ collapse = ","
+ )
+ }
+ )
+ }
+ )
+ labels <- sapply(prelabels, function(x) {
+ paste(x,
+ sep = " vs ",
+ collapse = " vs "
+ )
+ })
+ labels
}
diff --git a/R/utils_groupcomparison_plots.R b/R/utils_groupcomparison_plots.R
index e6040fc1..7998d44d 100644
--- a/R/utils_groupcomparison_plots.R
+++ b/R/utils_groupcomparison_plots.R
@@ -4,54 +4,60 @@
#' @param selected_labels character vector of contrast labels
#' @param all_labels character vector of all contrast labels
#' @keywords internal
-.checkGCPlotsInput = function(type, log_base, selected_labels, all_labels) {
- checkmate::assertChoice(type, c("HEATMAP", "VOLCANOPLOT", "COMPARISONPLOT"))
- checkmate::assertChoice(log_base, c(2, 10))
- if (selected_labels != "all") {
- if (is.character(selected_labels)) {
- chosen_labels = selected_labels
- print("labels")
- print(chosen_labels)
- wrong_labels = setdiff(chosen_labels, all_labels)
- if (length(wrong_labels) > 0) {
- msg_1 = paste("Please check labels of comparisons.",
- "Result does not have the following comparisons:")
- msg_2 = paste(wrong_labels, sep = ", ", collapse = ", ")
- msg = paste(msg_1, msg_2)
- stop(msg)
- }
- }
- if (is.numeric(selected_labels)) {
- n_labels = length(all_labels)
- if (n_labels < max(selected_labels)) {
- msg = paste("Please check your selection of comparisons. There are",
- n_labels, "comparisons in this result.")
- stop(msg)
- } else {
- chosen_labels = all_labels[selected_labels]
- }
- }
- } else {
- chosen_labels = all_labels
+.checkGCPlotsInput <- function(type, log_base, selected_labels, all_labels) {
+ checkmate::assertChoice(type, c("HEATMAP", "VOLCANOPLOT", "COMPARISONPLOT"))
+ checkmate::assertChoice(log_base, c(2, 10))
+ if (selected_labels != "all") {
+ if (is.character(selected_labels)) {
+ chosen_labels <- selected_labels
+ print("labels")
+ print(chosen_labels)
+ wrong_labels <- setdiff(chosen_labels, all_labels)
+ if (length(wrong_labels) > 0) {
+ msg_1 <- paste(
+ "Please check labels of comparisons.",
+ "Result does not have the following comparisons:"
+ )
+ msg_2 <- paste(wrong_labels, sep = ", ", collapse = ", ")
+ msg <- paste(msg_1, msg_2)
+ stop(msg)
+ }
+ }
+ if (is.numeric(selected_labels)) {
+ n_labels <- length(all_labels)
+ if (n_labels < max(selected_labels)) {
+ msg <- paste(
+ "Please check your selection of comparisons. There are",
+ n_labels, "comparisons in this result."
+ )
+ stop(msg)
+ } else {
+ chosen_labels <- all_labels[selected_labels]
+ }
}
- chosen_labels
+ } else {
+ chosen_labels <- all_labels
+ }
+ chosen_labels
}
#' @importFrom stats quantile dist
#' @keywords internal
-.getOrderedMatrix = function(input, type) {
- input_tmp = input
- input_tmp[is.na(input)] = 50
- if (toupper(type) == "PROTEIN") {
- input = input[hclust(dist(input_tmp), method = "ward.D")$order, ]
- } else if (toupper(type) == "COMPARISON") {
- input = input[, hclust(dist(t(input_tmp)), method = "ward.D")$order]
- } else if (toupper(type) == "BOTH") {
- input = input[hclust(dist(input_tmp), method = "ward.D")$order,
- hclust(dist(t(input)), method = "ward.D")$order]
- }
- input
+.getOrderedMatrix <- function(input, type) {
+ input_tmp <- input
+ input_tmp[is.na(input)] <- 50
+ if (toupper(type) == "PROTEIN") {
+ input <- input[hclust(dist(input_tmp), method = "ward.D")$order, ]
+ } else if (toupper(type) == "COMPARISON") {
+ input <- input[, hclust(dist(t(input_tmp)), method = "ward.D")$order]
+ } else if (toupper(type) == "BOTH") {
+ input <- input[
+ hclust(dist(input_tmp), method = "ward.D")$order,
+ hclust(dist(t(input)), method = "ward.D")$order
+ ]
+ }
+ input
}
@@ -61,119 +67,128 @@ colMin <- function(data) sapply(data, min, na.rm = TRUE)
#' Create colorkey for ggplot2 heatmap
#' @param my.colors blocks
#' @keywords internal
-.getColorKeyGGPlot2 = function(my.colors, blocks) {
- x.at = seq(-0.05, 1.05, length.out = 14)
- par(mar = c(3, 3, 3, 3), mfrow = c(3, 1), oma = c(3, 0, 3, 0))
- plot.new()
- image(z = matrix(seq(seq_len(length(my.colors) -1)), ncol = 1),
- col = my.colors,
- xaxt = "n",
- yaxt = "n")
- mtext("Color Key", side = 3,line = 1, cex = 3)
- mtext("(sign) Adjusted p-value", side = 1, line = 3, at = 0.5, cex = 1.7)
- mtext(blocks, side = 1, line = 1, at = x.at, cex = 1)
+.getColorKeyGGPlot2 <- function(my.colors, blocks) {
+ x.at <- seq(-0.05, 1.05, length.out = 14)
+ par(mar = c(3, 3, 3, 3), mfrow = c(3, 1), oma = c(3, 0, 3, 0))
+ plot.new()
+ image(
+ z = matrix(seq_len(length(my.colors) - 1), ncol = 1),
+ col = my.colors,
+ xaxt = "n",
+ yaxt = "n"
+ )
+ mtext("Color Key", side = 3, line = 1, cex = 3)
+ mtext("(sign) Adjusted p-value", side = 1, line = 3, at = 0.5, cex = 1.7)
+ mtext(blocks, side = 1, line = 1, at = x.at, cex = 1)
}
#' Create colorkey for plotly heatmap
#' @param my.colors blocks
#' @keywords internal
-.getColorKeyPlotly = function(my.colors, blocks) {
- color.key.plot <- plotly::layout(
- plot_ly(type = "image", z = list(my.colors)),
- xaxis = list(
- dtick = 0,
- ticktext = as.character(blocks),
- tickmode = "array",
- tickvals = -0.5:length(blocks),
- tickangle = 0,
- title = "(sign) Adjusted p-value"
- ),
- yaxis = list(
- ticks = "",
- showticklabels = FALSE
- )
+.getColorKeyPlotly <- function(my.colors, blocks) {
+ color.key.plot <- plotly::layout(
+ plot_ly(type = "image", z = list(my.colors)),
+ xaxis = list(
+ dtick = 0,
+ ticktext = as.character(blocks),
+ tickmode = "array",
+ tickvals = -0.5:length(blocks),
+ tickangle = 0,
+ title = "(sign) Adjusted p-value"
+ ),
+ yaxis = list(
+ ticks = "",
+ showticklabels = FALSE
)
-
- color.key.plot <- plotly::style(color.key.plot, hoverinfo = "none")
- color.key.plot
+ )
+
+ color.key.plot <- plotly::style(color.key.plot, hoverinfo = "none")
+ color.key.plot
}
#' Create heatmap
#' @param input data.table
#' @inheritParams groupComparisonPlots
#' @keywords internal
-.makeHeatmapPlotly = function(input, my.colors, my.breaks, x.axis.size, y.axis.size, height, numProtein) {
- input <- input[1:pmin(numProtein, nrow(input)), ,drop=F]
- par(oma = c(3, 0, 0, 4))
- label_formatter <- list(
- title = "",
- # titlefont = f1,
- showticklabels = TRUE,
- tickangle = 45,
- # tickfont = f2,
- exponentformat = "E")
-
- # adjust my.breaks
- x = my.breaks
- dltx <- diff(x)[1]
- x <- sort(c(x,-dltx/16,dltx/16))
- x <- x[x!=0]
- x.resc <- (x-min(x))/(max(x)-min(x))
+.makeHeatmapPlotly <- function(input, my.colors, my.breaks, x.axis.size, y.axis.size, height, numProtein) {
+ input <- input[1:pmin(numProtein, nrow(input)), , drop = F]
+ par(oma = c(3, 0, 0, 4))
+ label_formatter <- list(
+ title = "",
+ # titlefont = f1,
+ showticklabels = TRUE,
+ tickangle = 45,
+ # tickfont = f2,
+ exponentformat = "E"
+ )
- # get color scale
- cols = my.colors
- colorScale <- data.frame(
- z = c(0,rep(x.resc[2:(length(x.resc)-1)],each=2),1),
- col=rep(cols,each=2)
- )
-
- # Creating the custom hover text matrix
- row_names <- rownames(input)
- col_names <- colnames(input)
- hover_text_matrix <- matrix("", nrow = nrow(input), ncol = ncol(input))
- for (i in 1:nrow(input)) {
- for (j in 1:ncol(input)) {
- hover_text_matrix[i, j] <- sprintf("Comparison: %s<br>Protein: %s<br>Value: %.2f",
- col_names[j],
- row_names[i],
- input[i, j])
- }
+ # adjust my.breaks
+ x <- my.breaks
+ dltx <- diff(x)[1]
+ x <- sort(c(x, -dltx / 16, dltx / 16))
+ x <- x[x != 0]
+ x.resc <- (x - min(x)) / (max(x) - min(x))
+
+ # get color scale
+ cols <- my.colors
+ colorScale <- data.frame(
+ z = c(0, rep(x.resc[2:(length(x.resc) - 1)], each = 2), 1),
+ col = rep(cols, each = 2)
+ )
+
+ # Creating the custom hover text matrix
+ row_names <- rownames(input)
+ col_names <- colnames(input)
+ hover_text_matrix <- matrix("", nrow = nrow(input), ncol = ncol(input))
+ for (i in 1:nrow(input)) {
+ for (j in 1:ncol(input)) {
+ hover_text_matrix[i, j] <- sprintf(
+ "Comparison: %s
Protein: %s
Value: %.2f",
+ col_names[j],
+ row_names[i],
+ input[i, j]
+ )
}
-
- heatmap_plot = plot_ly(z = as.matrix(input),
- zmin = x[1],
- zmax = x[length(x)],
- x = colnames(input),
- xgap = 0,
- y = rownames(input),
- ygap = 0,
- type = "heatmap",
- hoverinfo = "text",
- text=hover_text_matrix,
- showlegend = FALSE,
- showscale = FALSE,
- colorscale = colorScale,
- colorbar = list(ypad = 520, tick0 = x[1], dtick = dltx, len = 1, orientation = "h"),
- width = 800)
-
- heatmap_plot <- plotly::layout(heatmap_plot,
- xaxis = label_formatter,
- plot_bgcolor = "grey",
- height = height)
- heatmap_plot
+ }
+
+ heatmap_plot <- plot_ly(
+ z = as.matrix(input),
+ zmin = x[1],
+ zmax = x[length(x)],
+ x = colnames(input),
+ xgap = 0,
+ y = rownames(input),
+ ygap = 0,
+ type = "heatmap",
+ hoverinfo = "text",
+ text = hover_text_matrix,
+ showlegend = FALSE,
+ showscale = FALSE,
+ colorscale = colorScale,
+ colorbar = list(ypad = 520, tick0 = x[1], dtick = dltx, len = 1, orientation = "h"),
+ width = 800
+ )
+
+ heatmap_plot <- plotly::layout(heatmap_plot,
+ xaxis = label_formatter,
+ plot_bgcolor = "grey",
+ height = height
+ )
+ heatmap_plot
}
-.makeHeatmapGgplot2 = function(input, my.colors, my.breaks, x.axis.size, y.axis.size,height) {
- par(oma = c(3, 0, 0, 4))
- heatmap.2(as.matrix(input),
- col = my.colors,
- Rowv = FALSE, Colv = FALSE,
- dendrogram = "none", breaks = my.breaks,
- trace = "none", na.color = "grey",
- cexCol = (x.axis.size / 10),
- cexRow = (y.axis.size / 10),
- key = FALSE,
- lhei = c(0.1, 0.9), lwid = c(0.1, 0.9))
+.makeHeatmapGgplot2 <- function(input, my.colors, my.breaks, x.axis.size, y.axis.size, height) {
+ par(oma = c(3, 0, 0, 4))
+ heatmap.2(as.matrix(input),
+ col = my.colors,
+ Rowv = FALSE, Colv = FALSE,
+ dendrogram = "none", breaks = my.breaks,
+ trace = "none", na.color = "grey",
+ cexCol = (x.axis.size / 10),
+ cexRow = (y.axis.size / 10),
+ key = FALSE,
+ lhei = c(0.1, 0.9), lwid = c(0.1, 0.9)
+ )
}
#' Create a volcano plot
@@ -183,96 +198,136 @@ colMin <- function(data) sapply(data, min, na.rm = TRUE)
#' @param log_base_FC 2 or 10
#' @param log_base_pval 2 or 10
#' @keywords internal
-.makeVolcano = function(
+.makeVolcano <- function(
input, label_name, log_base_FC, log_base_pval, x.lim, ProteinName, dot.size,
y.limdown, y.limup, text.size, FCcutoff, sig, x.axis.size, y.axis.size,
- legend.size, log_adjp
-) {
- Protein = NULL
- plot = ggplot(aes_string(x = "logFC",
- y = log_adjp,
- color = "colgroup",
- label = "Protein"),
- data = input) +
- geom_point(size = dot.size) +
- scale_colour_manual(values = c("gray65", "blue", "red"),
- limits = c("black", "blue", "red"),
- breaks = c("black", "blue", "red"),
- labels = c("No regulation", "Down-regulated", "Up-regulated")) +
- scale_y_continuous(paste0("-Log", log_base_pval, " (adjusted p-value)"),
- limits = c(y.limdown, y.limup)) +
- labs(title = unique(label_name))
- plot = plot +
- scale_x_continuous(paste0("Log", log_base_FC, " fold change"),
- limits = c(-x.lim, x.lim))
- if (ProteinName) {
- if (!(length(unique(input$colgroup)) == 1 & any(unique(input$colgroup) == "black"))) {
- plot = plot +
- geom_text_repel(data = input[input$colgroup != "black", ],
- aes(label = Protein),
- size = text.size,
- col = "black")
- }
- }
- if (!FCcutoff | is.numeric(FCcutoff)) {
- l = ifelse(!FCcutoff, 20, 10)
- sigcut = data.table::setnames(
- data.table::data.table("sigline",
- seq(-x.lim, x.lim, length.out = l),
- (-log(sig, base = log_base_pval)),
- "twodash"),
- c("Protein", "logFC", log_adjp, "line"))
- }
- if (!FCcutoff) {
- plot = plot +
- geom_line(data = sigcut,
- aes_string(x = "logFC", y = log_adjp, linetype = "line"),
- colour = "darkgrey",
- size = 0.6,
- show.legend = TRUE) +
- scale_linetype_manual(values = c("twodash" = 6),
- labels = c(paste0("Adj p-value cutoff (", sig, ")"))) +
- guides(colour = guide_legend(override.aes = list(linetype = 0)),
- linetype = guide_legend())
+ legend.size, log_adjp) {
+ Protein <- NULL
+ plot <- ggplot(
+ aes_string(
+ x = "logFC",
+ y = log_adjp,
+ color = "colgroup",
+ label = "Protein"
+ ),
+ data = input
+ ) +
+ geom_point(size = dot.size) +
+ scale_colour_manual(
+ values = c("gray65", "blue", "red"),
+ limits = c("black", "blue", "red"),
+ breaks = c("black", "blue", "red"),
+ labels = c("No regulation", "Down-regulated", "Up-regulated")
+ ) +
+ scale_y_continuous(paste0("-Log", log_base_pval, " (adjusted p-value)"),
+ limits = c(y.limdown, y.limup)
+ ) +
+ labs(title = unique(label_name))
+ plot <- plot +
+ scale_x_continuous(paste0("Log", log_base_FC, " fold change"),
+ limits = c(-x.lim, x.lim)
+ )
+ if (ProteinName) {
+ if (!(length(unique(input$colgroup)) == 1 & any(unique(input$colgroup) == "black"))) {
+ plot <- plot +
+ geom_text_repel(
+ data = input[input$colgroup != "black", ],
+ aes(label = Protein),
+ size = text.size,
+ col = "black"
+ )
}
- if (is.numeric(FCcutoff)) {
- FCcutpos = data.table::setnames(data.table("sigline",
- log(FCcutoff, log_base_FC),
- seq(y.limdown, y.limup, length.out = 10),
- "dotted"),
- c("Protein", "logFC", log_adjp, "line"))
- FCcutneg = data.table::setnames(data.table("sigline",
- (-log(FCcutoff, log_base_FC)),
- seq(y.limdown, y.limup, length.out = 10),
- "dotted"),
- c("Protein", "logFC", log_adjp, "line"))
- plot = plot +
- geom_line(data = sigcut,
- aes_string(x = "logFC", y = log_adjp, linetype = "line"),
- colour = "darkgrey",
- size = 0.6,
- show.legend = TRUE) +
- geom_line(data = FCcutpos,
- aes_string(x = "logFC", y = log_adjp, linetype = "line"),
- colour = "darkgrey",
- size = 0.6,
- show.legend = TRUE) +
- geom_line(data = FCcutneg,
- aes_string(x = "logFC", y = log_adjp, linetype = "line"),
- colour = "darkgrey",
- size = 0.6) +
- scale_linetype_manual(values = c("dotted" = 3, "twodash" = 6),
- labels = c(paste0("Fold change cutoff (", FCcutoff, ")"),
- paste0("Adj p-value cutoff (", sig, ")"))) +
- guides(colour = guide_legend(override.aes = list(linetype = 0)),
- linetype = guide_legend())
- }
- plot = plot +
- theme_msstats("VOLCANOPLOT", x.axis.size, y.axis.size,
- legend.size, strip_background = element_rect(),
- strip_text_x = element_text(),
- legend_position = "bottom", legend.title = element_blank())
- plot
+ }
+ if (!FCcutoff | is.numeric(FCcutoff)) {
+ l <- ifelse(!FCcutoff, 20, 10)
+ sigcut <- data.table::setnames(
+ data.table::data.table(
+ "sigline",
+ seq(-x.lim, x.lim, length.out = l),
+ (-log(sig, base = log_base_pval)),
+ "twodash"
+ ),
+ c("Protein", "logFC", log_adjp, "line")
+ )
+ }
+ if (!FCcutoff) {
+ plot <- plot +
+ geom_line(
+ data = sigcut,
+ aes_string(x = "logFC", y = log_adjp, linetype = "line"),
+ colour = "darkgrey",
+ size = 0.6,
+ show.legend = TRUE
+ ) +
+ scale_linetype_manual(
+ values = c("twodash" = 6),
+ labels = c(paste0("Adj p-value cutoff (", sig, ")"))
+ ) +
+ guides(
+ colour = guide_legend(override.aes = list(linetype = 0)),
+ linetype = guide_legend()
+ )
+ }
+ if (is.numeric(FCcutoff)) {
+ FCcutpos <- data.table::setnames(
+ data.table(
+ "sigline",
+ log(FCcutoff, log_base_FC),
+ seq(y.limdown, y.limup, length.out = 10),
+ "dotted"
+ ),
+ c("Protein", "logFC", log_adjp, "line")
+ )
+ FCcutneg <- data.table::setnames(
+ data.table(
+ "sigline",
+ (-log(FCcutoff, log_base_FC)),
+ seq(y.limdown, y.limup, length.out = 10),
+ "dotted"
+ ),
+ c("Protein", "logFC", log_adjp, "line")
+ )
+ plot <- plot +
+ geom_line(
+ data = sigcut,
+ aes_string(x = "logFC", y = log_adjp, linetype = "line"),
+ colour = "darkgrey",
+ size = 0.6,
+ show.legend = TRUE
+ ) +
+ geom_line(
+ data = FCcutpos,
+ aes_string(x = "logFC", y = log_adjp, linetype = "line"),
+ colour = "darkgrey",
+ size = 0.6,
+ show.legend = TRUE
+ ) +
+ geom_line(
+ data = FCcutneg,
+ aes_string(x = "logFC", y = log_adjp, linetype = "line"),
+ colour = "darkgrey",
+ size = 0.6
+ ) +
+ scale_linetype_manual(
+ values = c("dotted" = 3, "twodash" = 6),
+ labels = c(
+ paste0("Fold change cutoff (", FCcutoff, ")"),
+ paste0("Adj p-value cutoff (", sig, ")")
+ )
+ ) +
+ guides(
+ colour = guide_legend(override.aes = list(linetype = 0)),
+ linetype = guide_legend()
+ )
+ }
+ plot <- plot +
+ theme_msstats("VOLCANOPLOT", x.axis.size, y.axis.size,
+ legend.size,
+ strip_background = element_rect(),
+ strip_text_x = element_text(),
+ legend_position = "bottom", legend.title = element_blank()
+ )
+ plot
}
@@ -281,31 +336,37 @@ colMin <- function(data) sapply(data, min, na.rm = TRUE)
#' @param log_base 2 or 10
#' @inheritParams groupComparisonPlots
#' @keywords internal
-.makeComparison = function(
- input, log_base, dot.size, x.axis.size, y.axis.size,
- text.angle, hjust, vjust, y.limdown, y.limup
-) {
- logFC = ciw = NULL
-
- protein = unique(input$Protein)
- plot = ggplot(input, aes_string(x = "Label", y = "logFC")) +
- geom_errorbar(aes(ymax = logFC + ciw, ymin = logFC - ciw),
- data = input,
- width = 0.1,
- colour = "red") +
- geom_point(size = dot.size,
- colour = "darkred") +
- scale_x_discrete('Comparison') +
- geom_hline(yintercept = 0,
- linetype = "twodash",
- colour = "darkgrey",
- size = 0.6) +
- labs(title = protein) +
- theme_msstats("COMPARISONPLOT", x.axis.size, y.axis.size,
- text_angle = text.angle, text_hjust = hjust,
- text_vjust = vjust)
- plot = plot +
- scale_y_continuous(paste0("Log", log_base, "-Fold Change"),
- limits = c(y.limdown, y.limup))
- plot
+.makeComparison <- function(
+ input, log_base, dot.size, x.axis.size, y.axis.size,
+ text.angle, hjust, vjust, y.limdown, y.limup) {
+ logFC <- ciw <- NULL
+
+ protein <- unique(input$Protein)
+ plot <- ggplot(input, aes_string(x = "Label", y = "logFC")) +
+ geom_errorbar(aes(ymax = logFC + ciw, ymin = logFC - ciw),
+ data = input,
+ width = 0.1,
+ colour = "red"
+ ) +
+ geom_point(
+ size = dot.size,
+ colour = "darkred"
+ ) +
+ scale_x_discrete("Comparison") +
+ geom_hline(
+ yintercept = 0,
+ linetype = "twodash",
+ colour = "darkgrey",
+ size = 0.6
+ ) +
+ labs(title = protein) +
+ theme_msstats("COMPARISONPLOT", x.axis.size, y.axis.size,
+ text_angle = text.angle, text_hjust = hjust,
+ text_vjust = vjust
+ )
+ plot <- plot +
+ scale_y_continuous(paste0("Log", log_base, "-Fold Change"),
+ limits = c(y.limdown, y.limup)
+ )
+ plot
}
diff --git a/R/utils_imputation.R b/R/utils_imputation.R
index 16f9cd53..12346821 100644
--- a/R/utils_imputation.R
+++ b/R/utils_imputation.R
@@ -1,54 +1,60 @@
#' @importFrom data.table uniqueN
#' @importFrom survival survreg Surv
#' @keywords internal
-.fitSurvival = function(input) {
- FEATURE = RUN = NULL
-
- missingness_filter = is.finite(input$newABUNDANCE)
- n_total = nrow(input[missingness_filter, ])
- n_features = data.table::uniqueN(input[missingness_filter, FEATURE])
- n_runs = data.table::uniqueN(input[missingness_filter, RUN])
- is_labeled = data.table::uniqueN(input$LABEL) > 1
- countdf = n_total < n_features + n_runs - 1
+.fitSurvival <- function(input) {
+ FEATURE <- RUN <- NULL
- # TODO: set.seed here?
- set.seed(100)
- if (is_labeled) {
- if (length(unique(input$FEATURE)) == 1) {
- # with single feature, not converge, wrong intercept
- # need to check
- fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ RUN + ref,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- } else {
- if (countdf) {
- fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ RUN + ref,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- } else {
- fit = survreg(Surv(newABUNDANCE, cen, type='left') ~ FEATURE + RUN + ref,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- }
- }
+ missingness_filter <- is.finite(input$newABUNDANCE)
+ n_total <- nrow(input[missingness_filter, ])
+ n_features <- data.table::uniqueN(input[missingness_filter, FEATURE])
+ n_runs <- data.table::uniqueN(input[missingness_filter, RUN])
+ is_labeled <- data.table::uniqueN(input$LABEL) > 1
+ countdf <- n_total < n_features + n_runs - 1
+
+ # TODO: set.seed here?
+ set.seed(100)
+ if (is_labeled) {
+ if (length(unique(input$FEATURE)) == 1) {
+ # with a single feature the model may not converge and the intercept can be wrong
+ # TODO: needs further checking
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN + ref,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
+ } else {
+ if (countdf) {
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN + ref,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
+ } else {
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN + ref,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
+ }
+ }
+ } else {
+ if (n_features == 1L) {
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
} else {
- if (n_features == 1L) {
- fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- } else {
- if (countdf) {
- fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- } else {
- fit = survreg(Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN,
- data = input, dist = "gaussian",
- control = list(maxiter=90))
- }
- }
+ if (countdf) {
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
+ } else {
+ fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ FEATURE + RUN,
+ data = input, dist = "gaussian",
+ control = list(maxiter = 90)
+ )
+ }
}
- fit
+ }
+ fit
}
@@ -57,9 +63,9 @@
#' @return numeric vector of predictions
#' @importFrom stats predict
#' @keywords internal
-.addSurvivalPredictions = function(input) {
- LABEL = NULL
-
- survival_fit = .fitSurvival(input[LABEL == "L", ])
- predict(survival_fit, newdata = input)
+.addSurvivalPredictions <- function(input) {
+ LABEL <- NULL
+
+ survival_fit <- .fitSurvival(input[LABEL == "L", ])
+ predict(survival_fit, newdata = input)
}
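For context, the imputation above fits a left-censored accelerated failure time model and uses its predictions in place of censored values. A minimal sketch with made-up numbers; the coding assumed here is `cen = 1` for observed and `cen = 0` for left-censored entries:

```r
library(survival)

toy <- data.frame(
  newABUNDANCE = c(20.1, 19.8, 20.5, 21.3, 22.0, 21.1, 16.0, 16.0, 17.2),
  cen          = c(1,    1,    1,    1,    1,    1,    0,    0,    1),
  RUN          = factor(rep(c("r1", "r2", "r3"), each = 3))
)
fit <- survreg(Surv(newABUNDANCE, cen, type = "left") ~ RUN,
               data = toy, dist = "gaussian")
predict(fit, newdata = toy)  # model-based values that can replace censored entries
```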
diff --git a/R/utils_normalize.R b/R/utils_normalize.R
index 70820e20..c539a27e 100644
--- a/R/utils_normalize.R
+++ b/R/utils_normalize.R
@@ -1,41 +1,41 @@
#' Normalize MS data
-#'
+#'
#' @param input data.table in MSstats format
#' @param normalization_method name of a chosen normalization method: "NONE" or
#' "FALSE" for no normalization, "EQUALIZEMEDIANS" for median normalization,
#' "QUANTILE" normalization for quantile normalization from `preprocessCore` package,
#' "GLOBALSTANDARDS" for normalization based on selected peptides or proteins.
-#' @param peptides_dict `data.table` of names of peptides and their corresponding
-#' features.
-#' @param standards character vector with names of standards, required if
+#' @param peptides_dict `data.table` of names of peptides and their corresponding
+#' features.
+#' @param standards character vector with names of standards, required if
#' "GLOBALSTANDARDS" method was selected.
-#'
+#'
#' @export
-#'
+#'
#' @return data.table
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS") # median normalization
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS") # median normalization
#' head(input)
-#'
-MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, standards = NULL) {
- normalization_method = toupper(normalization_method)
- if (normalization_method == "NONE" | normalization_method == "FALSE") {
- return(input)
- } else if (normalization_method == "EQUALIZEMEDIANS") {
- input = .normalizeMedian(input)
- } else if (normalization_method == "QUANTILE") {
- input = .normalizeQuantile(input)
- } else if (normalization_method == "GLOBALSTANDARDS") {
- input = .normalizeGlobalStandards(input, peptides_dict, standards)
- }
- input
+#'
+MSstatsNormalize <- function(input, normalization_method, peptides_dict = NULL, standards = NULL) {
+ normalization_method <- toupper(normalization_method)
+ if (normalization_method == "NONE" | normalization_method == "FALSE") {
+ return(input)
+ } else if (normalization_method == "EQUALIZEMEDIANS") {
+ input <- .normalizeMedian(input)
+ } else if (normalization_method == "QUANTILE") {
+ input <- .normalizeQuantile(input)
+ } else if (normalization_method == "GLOBALSTANDARDS") {
+ input <- .normalizeGlobalStandards(input, peptides_dict, standards)
+ }
+ input
}
@@ -43,23 +43,26 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#' @param input `data.table` in standard MSstats format
#' @importFrom stats median
#' @keywords internal
-.normalizeMedian = function(input) {
- ABUNDANCE_RUN = ABUNDANCE_FRACTION = ABUNDANCE = NULL
-
- if (length(unique(input$LABEL)) == 1L) {
- label = "L"
- } else {
- label = "H"
- }
- input[, ABUNDANCE_RUN := .getMedian(.SD, label),
- by = c("RUN", "FRACTION"), .SDcols = c("ABUNDANCE", "LABEL")]
- input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
- by = "FRACTION"]
- input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
- input = input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
- with = FALSE]
- getOption("MSstatsLog")("Normalization based on median: OK")
- input
+.normalizeMedian <- function(input) {
+ ABUNDANCE_RUN <- ABUNDANCE_FRACTION <- ABUNDANCE <- NULL
+
+ if (length(unique(input$LABEL)) == 1L) {
+ label <- "L"
+ } else {
+ label <- "H"
+ }
+ input[, ABUNDANCE_RUN := .getMedian(.SD, label),
+ by = c("RUN", "FRACTION"), .SDcols = c("ABUNDANCE", "LABEL")
+ ]
+ input[, ABUNDANCE_FRACTION := median(ABUNDANCE_RUN, na.rm = TRUE),
+ by = "FRACTION"
+ ]
+ input[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + ABUNDANCE_FRACTION]
+ input <- input[, !(colnames(input) %in% c("ABUNDANCE_RUN", "ABUNDANCE_FRACTION")),
+ with = FALSE
+ ]
+ getOption("MSstatsLog")("Normalization based on median: OK")
+ input
}
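A toy illustration of what `.normalizeMedian` does, using made-up numbers for a single label and fraction: every run is shifted so that all runs end up with the same median.

```r
library(data.table)
toy <- data.table(RUN = rep(c("r1", "r2"), each = 3),
                  ABUNDANCE = c(10, 11, 12, 14, 15, 16))
toy[, ABUNDANCE_RUN := median(ABUNDANCE), by = "RUN"]
toy[, ABUNDANCE := ABUNDANCE - ABUNDANCE_RUN + median(unique(ABUNDANCE_RUN))]
toy[, median(ABUNDANCE), by = "RUN"]  # both runs now have median 13
```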
@@ -68,67 +71,77 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#' @param label "L" for light isotopes, "H" for heavy isotopes.
#' @importFrom stats median
#' @keywords internal
-.getMedian = function(df, label) {
- median(df$ABUNDANCE[df$LABEL == label], na.rm = TRUE)
+.getMedian <- function(df, label) {
+ median(df$ABUNDANCE[df$LABEL == label], na.rm = TRUE)
}
#' Quantile normalization based on the `preprocessCore` package
#' @param input `data.table` in MSstats standard format
#' @keywords internal
-.normalizeQuantile = function(input) {
- ABUNDANCE = FRACTION = RUN = LABEL = GROUP_ORIGINAL = NULL
- SUBJECT_ORIGINAL = PROTEIN = PEPTIDE = TRANSITION = INTENSITY = NULL
-
- input[ABUNDANCE == 0, "ABUNDANCE"] = 1
- fractions = unique(input$FRACTION)
- is_labeled = data.table::uniqueN(input$LABEL) > 1
- per_fraction_normalized = vector("list", length(fractions))
-
- if (!is_labeled) {
- grouping_cols = c("FEATURE", "RUN")
- for (fraction_id in fractions) {
- fraction_runs = as.character(unique(input[FRACTION == fractions[fraction_id], RUN]))
- wide = .getWideTable(input, fraction_runs)
- normalized = .quantileNormalizationSingleLabel(wide, fraction_runs)
- normalized = data.table::melt(normalized, id.vars = "FEATURE",
- variable.name = "RUN", value.name = "ABUNDANCE")
- per_fraction_normalized[[fraction_id]] = normalized
- }
- } else {
- grouping_cols = c("LABEL", "FEATURE", "RUN")
- for (fraction_id in fractions) {
- fraction_runs = as.character(unique(input[FRACTION == fractions[fraction_id], RUN]))
- wide_h = .getWideTable(input, fraction_runs, "H", FALSE)
- normalized_h = .quantileNormalizationSingleLabel(wide_h,
- fraction_runs, "H")
- normalized_h$LABEL = "H"
- wide_l = .getWideTable(input, fraction_runs)
- for (run_col in fraction_runs) {
- wide_l[[run_col]] = wide_l[[run_col]] - wide_h[[run_col]] + normalized_h[[run_col]]
- }
- wide_l$LABEL = "L"
- per_fraction_normalized[[fraction_id]] = data.table::melt(
- rbind(normalized_h, wide_l), id.vars = c("LABEL", "FEATURE"),
- variable.name = "RUN", value.name = "ABUNDANCE")
- }
+.normalizeQuantile <- function(input) {
+ ABUNDANCE <- FRACTION <- RUN <- LABEL <- GROUP_ORIGINAL <- NULL
+ SUBJECT_ORIGINAL <- PROTEIN <- PEPTIDE <- TRANSITION <- INTENSITY <- NULL
+
+ input[ABUNDANCE == 0, "ABUNDANCE"] <- 1
+ fractions <- unique(input$FRACTION)
+ is_labeled <- data.table::uniqueN(input$LABEL) > 1
+ per_fraction_normalized <- vector("list", length(fractions))
+
+ if (!is_labeled) {
+ grouping_cols <- c("FEATURE", "RUN")
+ for (fraction_id in fractions) {
+ fraction_runs <- as.character(unique(input[FRACTION == fractions[fraction_id], RUN]))
+ wide <- .getWideTable(input, fraction_runs)
+ normalized <- .quantileNormalizationSingleLabel(wide, fraction_runs)
+ normalized <- data.table::melt(normalized,
+ id.vars = "FEATURE",
+ variable.name = "RUN", value.name = "ABUNDANCE"
+ )
+ per_fraction_normalized[[fraction_id]] <- normalized
}
-
- per_fraction_normalized = data.table::rbindlist(per_fraction_normalized)
- input = merge(input[, colnames(input) != "ABUNDANCE", with = FALSE],
- per_fraction_normalized, by = grouping_cols)
- input = input[order(LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
- RUN, PROTEIN, PEPTIDE, TRANSITION), ]
- input[!is.na(INTENSITY) & INTENSITY == 1, "ABUNDANCE"] = 0 # Skyline
-
- if (length(fractions) == 1L) {
- msg = "Normalization : Quantile normalization - okay"
- } else {
- msg = "Normalization : Quantile normalization per fraction - okay"
+ } else {
+ grouping_cols <- c("LABEL", "FEATURE", "RUN")
+ for (fraction_id in fractions) {
+ fraction_runs <- as.character(unique(input[FRACTION == fractions[fraction_id], RUN]))
+ wide_h <- .getWideTable(input, fraction_runs, "H", FALSE)
+ normalized_h <- .quantileNormalizationSingleLabel(
+ wide_h,
+ fraction_runs, "H"
+ )
+ normalized_h$LABEL <- "H"
+ wide_l <- .getWideTable(input, fraction_runs)
+ for (run_col in fraction_runs) {
+ wide_l[[run_col]] <- wide_l[[run_col]] - wide_h[[run_col]] + normalized_h[[run_col]]
+ }
+ wide_l$LABEL <- "L"
+ per_fraction_normalized[[fraction_id]] <- data.table::melt(
+ rbind(normalized_h, wide_l),
+ id.vars = c("LABEL", "FEATURE"),
+ variable.name = "RUN", value.name = "ABUNDANCE"
+ )
}
- getOption("MSstatsLog")("INFO", msg)
-
- input
+ }
+
+ per_fraction_normalized <- data.table::rbindlist(per_fraction_normalized)
+ input <- merge(input[, colnames(input) != "ABUNDANCE", with = FALSE],
+ per_fraction_normalized,
+ by = grouping_cols
+ )
+ input <- input[order(
+ LABEL, GROUP_ORIGINAL, SUBJECT_ORIGINAL,
+ RUN, PROTEIN, PEPTIDE, TRANSITION
+ ), ]
+ input[!is.na(INTENSITY) & INTENSITY == 1, "ABUNDANCE"] <- 0 # Skyline
+
+ if (length(fractions) == 1L) {
+ msg <- "Normalization : Quantile normalization - okay"
+ } else {
+ msg <- "Normalization : Quantile normalization per fraction - okay"
+ }
+ getOption("MSstatsLog")("INFO", msg)
+
+ input
}
@@ -138,21 +151,23 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#' @param label "L" for light isotopes, "H" for heavy isotopes
#' @param remove_missing if TRUE, only non-missing values will be considered
#' @keywords internal
-.getWideTable = function(input, runs, label = "L", remove_missing = TRUE) {
- RUN = NULL
-
- if (remove_missing) {
- nonmissing_filter = !is.na(input$INTENSITY)
- } else {
- nonmissing_filter = rep(TRUE, nrow(input))
- }
- label_filter = input$LABEL == label
-
- wide = data.table::dcast(input[nonmissing_filter & label_filter & (RUN %in% runs)],
- FEATURE ~ RUN, value.var = "ABUNDANCE")
- wide = wide[, lapply(.SD, .replaceZerosWithNA)]
- colnames(wide)[-1] = runs
- wide
+.getWideTable <- function(input, runs, label = "L", remove_missing = TRUE) {
+ RUN <- NULL
+
+ if (remove_missing) {
+ nonmissing_filter <- !is.na(input$INTENSITY)
+ } else {
+ nonmissing_filter <- rep(TRUE, nrow(input))
+ }
+ label_filter <- input$LABEL == label
+
+ wide <- data.table::dcast(input[nonmissing_filter & label_filter & (RUN %in% runs)],
+ FEATURE ~ RUN,
+ value.var = "ABUNDANCE"
+ )
+ wide <- wide[, lapply(.SD, .replaceZerosWithNA)]
+ colnames(wide)[-1] <- runs
+ wide
}
@@ -162,26 +177,27 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#' @param label "L" for light isotopes, "H" for heavy isotopes
#' @importFrom preprocessCore normalize.quantiles
#' @keywords internal
-.quantileNormalizationSingleLabel = function(input, runs, label = "L") {
- FEATURE = NULL
-
- normalized = input[, list(FEATURE, preprocessCore::normalize.quantiles(as.matrix(.SD))),
- .SDcols = runs]
- colnames(normalized)[-1] = runs
- normalized
+.quantileNormalizationSingleLabel <- function(input, runs, label = "L") {
+ FEATURE <- NULL
+
+ normalized <- input[, list(FEATURE, preprocessCore::normalize.quantiles(as.matrix(.SD))),
+ .SDcols = runs
+ ]
+ colnames(normalized)[-1] <- runs
+ normalized
}
#' Utility function for normalization: replace 0s by NA
#' @param vec vector
#' @keywords internal
-.replaceZerosWithNA = function(vec) {
- vec = unlist(vec, FALSE, FALSE)
- if (is.character(vec) | is.factor(vec)) {
- vec
- } else {
- ifelse(vec == 0, NA, vec)
- }
+.replaceZerosWithNA <- function(vec) {
+ vec <- unlist(vec, FALSE, FALSE)
+ if (is.character(vec) | is.factor(vec)) {
+ vec
+ } else {
+ ifelse(vec == 0, NA, vec)
+ }
}
@@ -189,163 +205,194 @@ MSstatsNormalize = function(input, normalization_method, peptides_dict = NULL, s
#' @inheritParams MSstatsNormalize
#' @importFrom data.table melt uniqueN
#' @keywords internal
-.normalizeGlobalStandards = function(input, peptides_dict, standards) {
- PeptideSequence = PEPTIDE = PROTEIN = median_by_fraction = NULL
- Standard = FRACTION = LABEL = ABUNDANCE = RUN = GROUP = NULL
-
- proteins = as.character(unique(input$PROTEIN))
- means_by_standard = unique(input[, list(RUN)])
- for (standard_id in seq_along(standards)) {
- peptide_name = unlist(peptides_dict[PeptideSequence == standards[standard_id],
- as.character(PEPTIDE)], FALSE, FALSE)
- if (length(peptide_name) > 0) {
- standard = input[PEPTIDE == peptide_name, ]
- } else {
- if (standards[standard_id] %in% proteins) {
- standard = input[PROTEIN == standards[standard_id], ]
- } else {
- msg = paste("global standard peptides or proteins, ",
- standards[standard_id],", is not in dataset.",
- "Please check whether 'nameStandards' input is correct or not.")
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- }
- }
- mean_by_run = standard[GROUP != "0" & !is.na(ABUNDANCE),
- list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)),
- by = "RUN"]
- colnames(mean_by_run)[2] = paste0("meanStandard", standard_id)
- means_by_standard = merge(means_by_standard, mean_by_run,
- by = "RUN", all.x = TRUE)
- }
- means_by_standard = data.table::melt(means_by_standard, id.vars = "RUN",
- variable.name = "Standard", value.name = "ABUNDANCE")
- means_by_standard[, mean_by_run := mean(ABUNDANCE, na.rm = TRUE), by = "RUN"]
- means_by_standard = merge(means_by_standard, unique(input[, list(RUN, FRACTION)]),
- by = "RUN")
- means_by_standard[, median_by_fraction := median(mean_by_run, na.rm = TRUE),
- by = "FRACTION"]
- means_by_standard[, ABUNDANCE := NULL]
- means_by_standard[, Standard := NULL]
- means_by_standard = unique(means_by_standard)
-
- input = merge(input, means_by_standard, all.x = TRUE, by = c("RUN", "FRACTION"))
- input[, ABUNDANCE := ifelse(LABEL == "L", ABUNDANCE - mean_by_run + median_by_fraction, ABUNDANCE)]
-
- if (data.table::uniqueN(input$FRACTION) == 1L) {
- msg = "Normalization : normalization with global standards protein - okay"
+.normalizeGlobalStandards <- function(input, peptides_dict, standards) {
+ PeptideSequence <- PEPTIDE <- PROTEIN <- median_by_fraction <- NULL
+ Standard <- FRACTION <- LABEL <- ABUNDANCE <- RUN <- GROUP <- NULL
+
+ proteins <- as.character(unique(input$PROTEIN))
+ means_by_standard <- unique(input[, list(RUN)])
+ for (standard_id in seq_along(standards)) {
+ peptide_name <- unlist(peptides_dict[
+ PeptideSequence == standards[standard_id],
+ as.character(PEPTIDE)
+ ], FALSE, FALSE)
+ if (length(peptide_name) > 0) {
+ standard <- input[PEPTIDE == peptide_name, ]
} else {
- msg = "Normalization : normalization with global standards protein - okay"
+ if (standards[standard_id] %in% proteins) {
+ standard <- input[PROTEIN == standards[standard_id], ]
+ } else {
+ msg <- paste(
+ "global standard peptides or proteins, ",
+ standards[standard_id], ", is not in dataset.",
+ "Please check whether 'nameStandards' input is correct or not."
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ }
}
- getOption("MSstatsLog")("INFO", msg)
- input[ , !(colnames(input) %in% c("mean_by_run", "median_by_fraction")), with = FALSE]
+ mean_by_run <- standard[GROUP != "0" & !is.na(ABUNDANCE),
+ list(mean_abundance = mean(ABUNDANCE, na.rm = TRUE)),
+ by = "RUN"
+ ]
+ colnames(mean_by_run)[2] <- paste0("meanStandard", standard_id)
+ means_by_standard <- merge(means_by_standard, mean_by_run,
+ by = "RUN", all.x = TRUE
+ )
+ }
+ means_by_standard <- data.table::melt(means_by_standard,
+ id.vars = "RUN",
+ variable.name = "Standard", value.name = "ABUNDANCE"
+ )
+ means_by_standard[, mean_by_run := mean(ABUNDANCE, na.rm = TRUE), by = "RUN"]
+ means_by_standard <- merge(means_by_standard, unique(input[, list(RUN, FRACTION)]),
+ by = "RUN"
+ )
+ means_by_standard[, median_by_fraction := median(mean_by_run, na.rm = TRUE),
+ by = "FRACTION"
+ ]
+ means_by_standard[, ABUNDANCE := NULL]
+ means_by_standard[, Standard := NULL]
+ means_by_standard <- unique(means_by_standard)
+
+ input <- merge(input, means_by_standard, all.x = TRUE, by = c("RUN", "FRACTION"))
+ input[, ABUNDANCE := ifelse(LABEL == "L", ABUNDANCE - mean_by_run + median_by_fraction, ABUNDANCE)]
+
+ if (data.table::uniqueN(input$FRACTION) == 1L) {
+ msg <- "Normalization : normalization with global standards protein - okay"
+ } else {
+ msg <- "Normalization : normalization with global standards protein - okay"
+ }
+ getOption("MSstatsLog")("INFO", msg)
+ input[, !(colnames(input) %in% c("mean_by_run", "median_by_fraction")), with = FALSE]
}
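# Editor's illustrative sketch, not part of the patch: the global-standards
# adjustment above, reduced to one line of arithmetic for a single light-label
# measurement. The run-level mean of the standard(s) is subtracted and the
# fraction-level median of those run means is added back. Numbers are made up.
abundance          <- 21.4  # log-abundance of one endogenous feature in one run
mean_by_run        <- 20.9  # mean standard abundance in that run
median_by_fraction <- 21.2  # median of the run means within the fraction
abundance - mean_by_run + median_by_fraction  # 21.7, the normalized value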
#' Re-format the data before feature selection
-#'
+#'
#' @param input `data.table` in MSstats format
-#'
+#'
#' @importFrom data.table uniqueN
-#'
+#'
#' @export
-#'
+#'
#' @return data.table
-#'
-#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#'
+#' @examples
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
#' head(input)
-#'
-MSstatsMergeFractions = function(input) {
- ABUNDANCE = INTENSITY = GROUP_ORIGINAL = SUBJECT_ORIGINAL = RUN = NULL
- originalRUN = FRACTION = TECHREPLICATE = tmp = merged = newRun = NULL
- ncount = FEATURE = NULL
-
- input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] = 0
- input[!is.na(INTENSITY) & INTENSITY == 1, "ABUNDANCE"] = 0
- if (data.table::uniqueN(input$FRACTION) == 1) {
- return(input)
- } else {
- if (is.element("TECHREPLICATE", colnames(input))) {
- run_info = unique(input[,
- list(GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN,
- originalRUN, FRACTION, TECHREPLICATE)])
- match_runs = try(
- data.table::dcast(run_info,
- GROUP_ORIGINAL + SUBJECT_ORIGINAL + TECHREPLICATE ~ FRACTION,
- value.var = "originalRUN"), silent = TRUE
- )
- if (inherits(match_runs, "try-error")) {
- msg = "*** error : can't figure out which multiple runs come from the same sample."
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- } else {
- input$newRun = NA
- input$newRun = as.character(input$newRun)
- run_info[, GROUP_ORIGINAL := as.character(GROUP_ORIGINAL)]
- run_info[, SUBJECT_ORIGINAL := as.character(SUBJECT_ORIGINAL)]
- for (k in seq_len(nrow(run_info))) {
- input[originalRUN %in% run_info$originalRUN[k], "newRun"] = paste(paste(run_info[k, 1:4], collapse = "_"), 'merged', sep = "_")
- }
-
- select_fraction = input[!is.na(ABUNDANCE) & ABUNDANCE > 0,
- list(ncount = .N),
- by = c("FEATURE", "FRACTION")]
- select_fraction = select_fraction[ncount == data.table::uniqueN(input$newRun)]
- select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
- input$tmp = paste(input$FEATURE, input$FRACTION, sep="_")
- input = input[!(tmp %in% select_fraction$tmp), ]
- input$originalRUN = input$newRun
- input$RUN = input$originalRUN
- input$RUN = factor(input$RUN, levels=unique(input$RUN), labels=seq(1, length(unique(input$RUN))))
- input = input[, !(colnames(input) %in% c('tmp','newRun')), with = FALSE]
- }
- } else {
- run_info = unique(input[,
- list(GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN,
- originalRUN, FRACTION)])
- match_runs = try(
- data.table::dcast(run_info,
- GROUP_ORIGINAL + SUBJECT_ORIGINAL ~ FRACTION,
- value.var = "originalRUN"), silent = TRUE
- )
- if (inherits(match_runs, "try-error")) {
- msg = "*** error : can't figure out which multiple runs come from the same sample."
- getOption("MSstatsLog")("ERROR", msg)
- stop(msg)
- } else {
- match_runs[, merged := "merged"]
- match_runs[, newRun := do.call(paste, c(.SD, sep = "_")),
- .SDcols = c(1:3, ncol(match_runs))]
- match_runs = unique(match_runs[, list(GROUP_ORIGINAL,
- SUBJECT_ORIGINAL,
- newRun)])
-
- input = merge(input, match_runs,
- by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
- all.x = TRUE)
- select_fraction = input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
- list(ncount = .N),
- by = c("FEATURE", "FRACTION")]
- select_fraction = select_fraction[ncount != 0]
- select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
- input$tmp = paste(input$FEATURE, input$FRACTION, sep = "_")
- input = input[tmp %in% select_fraction$tmp, ]
- input$originalRUN = input$newRun
- input$RUN = input$originalRUN
- input$RUN = factor(input$RUN, levels = unique(input$RUN),
- labels = seq_along(unique(input$RUN)))
- input = input[, !(colnames(input) %in% c('tmp','newRun')),
- with = FALSE]
- }
+#'
+MSstatsMergeFractions <- function(input) {
+ ABUNDANCE <- INTENSITY <- GROUP_ORIGINAL <- SUBJECT_ORIGINAL <- RUN <- NULL
+ originalRUN <- FRACTION <- TECHREPLICATE <- tmp <- merged <- newRun <- NULL
+ ncount <- FEATURE <- NULL
+
+ input[!is.na(ABUNDANCE) & ABUNDANCE < 0, "ABUNDANCE"] <- 0
+ input[!is.na(INTENSITY) & INTENSITY == 1, "ABUNDANCE"] <- 0
+ if (data.table::uniqueN(input$FRACTION) == 1) {
+ return(input)
+ } else {
+ if (is.element("TECHREPLICATE", colnames(input))) {
+ run_info <- unique(input[
+ ,
+ list(
+ GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN,
+ originalRUN, FRACTION, TECHREPLICATE
+ )
+ ])
+ match_runs <- try(
+ data.table::dcast(run_info,
+ GROUP_ORIGINAL + SUBJECT_ORIGINAL + TECHREPLICATE ~ FRACTION,
+ value.var = "originalRUN"
+ ),
+ silent = TRUE
+ )
+ if (inherits(match_runs, "try-error")) {
+ msg <- "*** error : can't figure out which multiple runs come from the same sample."
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ } else {
+ input$newRun <- NA
+ input$newRun <- as.character(input$newRun)
+ run_info[, GROUP_ORIGINAL := as.character(GROUP_ORIGINAL)]
+ run_info[, SUBJECT_ORIGINAL := as.character(SUBJECT_ORIGINAL)]
+ for (k in seq_len(nrow(run_info))) {
+ input[originalRUN %in% run_info$originalRUN[k], "newRun"] <- paste(paste(run_info[k, 1:4], collapse = "_"), "merged", sep = "_")
}
+
+ select_fraction <- input[!is.na(ABUNDANCE) & ABUNDANCE > 0,
+ list(ncount = .N),
+ by = c("FEATURE", "FRACTION")
+ ]
+ select_fraction <- select_fraction[ncount == data.table::uniqueN(input$newRun)]
+ select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
+ input$tmp <- paste(input$FEATURE, input$FRACTION, sep = "_")
+ input <- input[!(tmp %in% select_fraction$tmp), ]
+ input$originalRUN <- input$newRun
+ input$RUN <- input$originalRUN
+ input$RUN <- factor(input$RUN, levels = unique(input$RUN), labels = seq(1, length(unique(input$RUN))))
+ input <- input[, !(colnames(input) %in% c("tmp", "newRun")), with = FALSE]
+ }
+ } else {
+ run_info <- unique(input[
+ ,
+ list(
+ GROUP_ORIGINAL, SUBJECT_ORIGINAL, RUN,
+ originalRUN, FRACTION
+ )
+ ])
+ match_runs <- try(
+ data.table::dcast(run_info,
+ GROUP_ORIGINAL + SUBJECT_ORIGINAL ~ FRACTION,
+ value.var = "originalRUN"
+ ),
+ silent = TRUE
+ )
+ if (inherits(match_runs, "try-error")) {
+ msg <- "*** error : can't figure out which multiple runs come from the same sample."
+ getOption("MSstatsLog")("ERROR", msg)
+ stop(msg)
+ } else {
+ match_runs[, merged := "merged"]
+ match_runs[, newRun := do.call(paste, c(.SD, sep = "_")),
+ .SDcols = c(1:3, ncol(match_runs))
+ ]
+ match_runs <- unique(match_runs[, list(
+ GROUP_ORIGINAL,
+ SUBJECT_ORIGINAL,
+ newRun
+ )])
+
+ input <- merge(input, match_runs,
+ by = c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+ all.x = TRUE
+ )
+ select_fraction <- input[!is.na(ABUNDANCE) & input$ABUNDANCE > 0,
+ list(ncount = .N),
+ by = c("FEATURE", "FRACTION")
+ ]
+ select_fraction <- select_fraction[ncount != 0]
+ select_fraction[, tmp := paste(FEATURE, FRACTION, sep = "_")]
+ input$tmp <- paste(input$FEATURE, input$FRACTION, sep = "_")
+ input <- input[tmp %in% select_fraction$tmp, ]
+ input$originalRUN <- input$newRun
+ input$RUN <- input$originalRUN
+ input$RUN <- factor(input$RUN,
+ levels = unique(input$RUN),
+ labels = seq_along(unique(input$RUN))
+ )
+ input <- input[, !(colnames(input) %in% c("tmp", "newRun")),
+ with = FALSE
+ ]
+ }
}
- input
+ }
+ input
}
diff --git a/R/utils_output.R b/R/utils_output.R
index 521fdfb1..0704c804 100644
--- a/R/utils_output.R
+++ b/R/utils_output.R
@@ -1,5 +1,5 @@
#' Post-processing output from MSstats summarization
-#'
+#'
#' @param input `data.table` in MSstats format
#' @param summarized output of the `MSstatsSummarizeWithSingleCore` function
#' @param processed output of MSstatsSelectFeatures
@@ -7,98 +7,119 @@
#' (`summaryMethod` parameter to `dataProcess`)
#' @param impute if TRUE, censored missing values were imputed
#' (`MBimpute` parameter to `dataProcess`)
-#' @param censored_symbol censored missing value indicator
+#' @param censored_symbol censored missing value indicator
#' (`censoredInt` parameter to `dataProcess`)
-#'
+#'
#' @return list that consists of the following elements:
#' \itemize{
-#' \item{FeatureLevelData}{ - feature-level data after processing}
+#' \item{FeatureLevelData}{ - feature-level data after processing}
#' \item{ProteinLevelData}{ - protein-level (summarized) data}
#' \item{SummaryMethod}{ (string) - name of summarization method that was used}
#' }
-#'
+#'
#' @export
-#'
+#'
#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input = MSstatsSelectFeatures(input, "all")
-#' processed = getProcessed(input)
-#' input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-#' summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
-#' output = output = MSstatsSummarizationOutput(input, summarized, processed,
-#' method, impute, cens)
-#'
-MSstatsSummarizationOutput = function(input, summarized, processed,
- method, impute, censored_symbol) {
- LABEL = TotalGroupMeasurements = GROUP = Protein = RUN = NULL
-
- input = .finalizeInput(input, summarized, method, impute, censored_symbol)
- summarized = lapply(summarized, function(x) x[[1]])
- summarized = data.table::rbindlist(summarized)
- if (inherits(summarized, "try-error")) {
- msg = paste("*** error : can't summarize per subplot with ",
- method, ".")
- getOption("MSstatsLog")("ERROR", msg)
- getOption("MSstatsMsg")("ERROR", msg)
- rqall = NULL
- rqmodelqc = NULL
- workpred = NULL
- } else {
- input[LABEL == "L", TotalGroupMeasurements := uniqueN(.SD),
- by = c("PROTEIN", "GROUP"),
- .SDcols = c("FEATURE", "originalRUN")]
- cols = intersect(c("PROTEIN", "originalRUN", "RUN", "GROUP",
- "GROUP_ORIGINAL", "SUBJECT_ORIGINAL",
- "TotalGroupMeasurements",
- "NumMeasuredFeature", "MissingPercentage",
- "more50missing", "NumImputedFeature"),
- colnames(input))
- merge_col = ifelse(is.element("RUN", colnames(summarized)),
- "RUN", "SUBJECT_ORIGINAL")
- lab = unique(input[LABEL == "L", cols, with = FALSE])
- if (nlevels(input$LABEL) > 1) {
- lab = lab[GROUP != 0]
- }
- lab = lab[, colnames(lab) != "GROUP", with = FALSE]
- rqall = merge(summarized, lab, by.x = c(merge_col, "Protein"),
- by.y = c(merge_col, "PROTEIN"))
- data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
- c("GROUP", "SUBJECT"), skip_absent = TRUE)
-
- rqall$GROUP = factor(as.character(rqall$GROUP))
- rqall$Protein = factor(rqall$Protein)
- rqmodelqc = summarized$ModelQC
- }
-
- if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) {
- rqall = rqall[order(Protein, as.numeric(as.character(RUN))), ]
- rownames(rqall) = NULL
- }
- output_cols = intersect(c("PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE",
- "LABEL", "GROUP", "RUN", "SUBJECT", "FRACTION",
- "originalRUN", "censored", "INTENSITY", "ABUNDANCE",
- "newABUNDANCE", "predicted", "feature_quality",
- "is_outlier", "remove"), colnames(input))
- input = input[, output_cols, with = FALSE]
-
- if (is.element("remove", colnames(processed))) {
- processed = processed[(remove),
- intersect(output_cols,
- colnames(processed)), with = FALSE]
- input = rbind(input, processed, fill = TRUE)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input <- MSstatsSelectFeatures(input, "all")
+#' processed <- getProcessed(input)
+#' input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+#' summarized <- MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
+#' output <- MSstatsSummarizationOutput(
+#' input, summarized, processed,
+#' method, impute, cens
+#' )
+#'
+MSstatsSummarizationOutput <- function(input, summarized, processed,
+ method, impute, censored_symbol) {
+ LABEL <- TotalGroupMeasurements <- GROUP <- Protein <- RUN <- NULL
+
+ input <- .finalizeInput(input, summarized, method, impute, censored_symbol)
+ summarized <- lapply(summarized, function(x) x[[1]])
+ summarized <- data.table::rbindlist(summarized)
+ if (inherits(summarized, "try-error")) {
+ msg <- paste(
+ "*** error : can't summarize per subplot with ",
+ method, "."
+ )
+ getOption("MSstatsLog")("ERROR", msg)
+ getOption("MSstatsMsg")("ERROR", msg)
+ rqall <- NULL
+ rqmodelqc <- NULL
+ workpred <- NULL
+ } else {
+ input[LABEL == "L", TotalGroupMeasurements := uniqueN(.SD),
+ by = c("PROTEIN", "GROUP"),
+ .SDcols = c("FEATURE", "originalRUN")
+ ]
+ cols <- intersect(
+ c(
+ "PROTEIN", "originalRUN", "RUN", "GROUP",
+ "GROUP_ORIGINAL", "SUBJECT_ORIGINAL",
+ "TotalGroupMeasurements",
+ "NumMeasuredFeature", "MissingPercentage",
+ "more50missing", "NumImputedFeature"
+ ),
+ colnames(input)
+ )
+ merge_col <- ifelse(is.element("RUN", colnames(summarized)),
+ "RUN", "SUBJECT_ORIGINAL"
+ )
+ lab <- unique(input[LABEL == "L", cols, with = FALSE])
+ if (nlevels(input$LABEL) > 1) {
+ lab <- lab[GROUP != 0]
}
- list(FeatureLevelData = as.data.frame(input),
- ProteinLevelData = as.data.frame(rqall),
- SummaryMethod = method)
-
+ lab <- lab[, colnames(lab) != "GROUP", with = FALSE]
+ rqall <- merge(summarized, lab,
+ by.x = c(merge_col, "Protein"),
+ by.y = c(merge_col, "PROTEIN")
+ )
+ data.table::setnames(rqall, c("GROUP_ORIGINAL", "SUBJECT_ORIGINAL"),
+ c("GROUP", "SUBJECT"),
+ skip_absent = TRUE
+ )
+
+ rqall$GROUP <- factor(as.character(rqall$GROUP))
+ rqall$Protein <- factor(rqall$Protein)
+ rqmodelqc <- summarized$ModelQC
+ }
+
+ if (is.element("RUN", colnames(rqall)) & !is.null(rqall)) {
+ rqall <- rqall[order(Protein, as.numeric(as.character(RUN))), ]
+ rownames(rqall) <- NULL
+ }
+ output_cols <- intersect(c(
+ "PROTEIN", "PEPTIDE", "TRANSITION", "FEATURE",
+ "LABEL", "GROUP", "RUN", "SUBJECT", "FRACTION",
+ "originalRUN", "censored", "INTENSITY", "ABUNDANCE",
+ "newABUNDANCE", "predicted", "feature_quality",
+ "is_outlier", "remove"
+ ), colnames(input))
+ input <- input[, output_cols, with = FALSE]
+
+ if (is.element("remove", colnames(processed))) {
+ processed <- processed[(remove),
+ intersect(
+ output_cols,
+ colnames(processed)
+ ),
+ with = FALSE
+ ]
+ input <- rbind(input, processed, fill = TRUE)
+ }
+ list(
+ FeatureLevelData = as.data.frame(input),
+ ProteinLevelData = as.data.frame(rqall),
+ SummaryMethod = method
+ )
}
@@ -109,79 +130,85 @@ MSstatsSummarizationOutput = function(input, summarized, processed,
#' @param impute if TRUE, censored missing values were imputed
#' @param censored_symbol censored missing value indicator
#' @keywords internal
-.finalizeInput = function(input, summarized, method, impute, censored_symbol) {
- if (method == "TMP") {
- input = .finalizeTMP(input, censored_symbol, impute, summarized)
- } else {
- input = .finalizeLinear(input, censored_symbol)
- }
- input
+.finalizeInput <- function(input, summarized, method, impute, censored_symbol) {
+ if (method == "TMP") {
+ input <- .finalizeTMP(input, censored_symbol, impute, summarized)
+ } else {
+ input <- .finalizeLinear(input, censored_symbol)
+ }
+ input
}
#' Summary statistics for output of TMP-based summarization
#' @inheritParams .finalizeInput
#' @keywords internal
-.finalizeTMP = function(input, censored_symbol, impute, summarized) {
- NonMissingStats = NumMeasuredFeature = MissingPercentage = LABEL = NULL
- total_features = more50missing = nonmissing_orig = censored = NULL
- INTENSITY = newABUNDANCE = NumImputedFeature = NULL
-
- survival_predictions = lapply(summarized, function(x) x[[2]])
- predicted_survival = data.table::rbindlist(survival_predictions)
- if (impute) {
- cols = intersect(colnames(input), c("newABUNDANCE",
- "cen", "RUN",
- "FEATURE", "ref"))
- input = merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
- predicted_survival,
- by = setdiff(cols, "newABUNDANCE"),
- all.x = TRUE)
+.finalizeTMP <- function(input, censored_symbol, impute, summarized) {
+ NonMissingStats <- NumMeasuredFeature <- MissingPercentage <- LABEL <- NULL
+ total_features <- more50missing <- nonmissing_orig <- censored <- NULL
+ INTENSITY <- newABUNDANCE <- NumImputedFeature <- NULL
+
+ survival_predictions <- lapply(summarized, function(x) x[[2]])
+ predicted_survival <- data.table::rbindlist(survival_predictions)
+ if (impute) {
+ cols <- intersect(colnames(input), c(
+ "newABUNDANCE",
+ "cen", "RUN",
+ "FEATURE", "ref"
+ ))
+ input <- merge(input[, colnames(input) != "newABUNDANCE", with = FALSE],
+ predicted_survival,
+ by = setdiff(cols, "newABUNDANCE"),
+ all.x = TRUE
+ )
+ }
+ input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
+ input[, NumMeasuredFeature := sum(NonMissingStats),
+ by = c("PROTEIN", "RUN")
+ ]
+ input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)]
+ input[, more50missing := MissingPercentage >= 0.5]
+ if (!is.null(censored_symbol)) {
+ if (is.element("censored", colnames(input))) {
+ input[, nonmissing_orig := LABEL == "L" & !censored]
+ } else {
+ input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)]
}
- input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
- input[, NumMeasuredFeature := sum(NonMissingStats),
- by = c("PROTEIN", "RUN")]
- input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)]
- input[, more50missing := MissingPercentage >= 0.5]
- if (!is.null(censored_symbol)) {
- if (is.element("censored", colnames(input))) {
- input[, nonmissing_orig := LABEL == "L" & !censored]
- } else {
- input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)]
- }
- input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
- if (impute) {
- input[, NumImputedFeature := sum(LABEL == "L" & !nonmissing_orig),
- by = c("PROTEIN", "RUN")]
- } else {
- input[, NumImputedFeature := 0]
- }
+ input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+ if (impute) {
+ input[, NumImputedFeature := sum(LABEL == "L" & !nonmissing_orig),
+ by = c("PROTEIN", "RUN")
+ ]
+ } else {
+ input[, NumImputedFeature := 0]
}
- input
+ }
+ input
}
#' Summary statistics for linear model-based summarization
#' @inheritParams .finalizeInput
#' @keywords internal
-.finalizeLinear = function(input, censored_symbol) {
- NonMissingStats = NumMeasuredFeature = MissingPercentage = NULL
- total_features = more50missing = nonmissing_orig = LABEL = NULL
- censored = INTENSITY = newABUNDANCE = NumImputedFeature = NULL
-
- input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
- input[, NumMeasuredFeature := sum(NonMissingStats),
- by = c("PROTEIN", "RUN")]
- input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)]
- input[, more50missing := MissingPercentage >= 0.5]
- if (!is.null(censored_symbol)) {
- if (is.element("censored", colnames(input))) {
- input[, nonmissing_orig := LABEL == "L" & !censored]
- } else {
- input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)]
- }
- input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
- input[, NumImputedFeature := 0]
+.finalizeLinear <- function(input, censored_symbol) {
+ NonMissingStats <- NumMeasuredFeature <- MissingPercentage <- NULL
+ total_features <- more50missing <- nonmissing_orig <- LABEL <- NULL
+ censored <- INTENSITY <- newABUNDANCE <- NumImputedFeature <- NULL
+
+ input[, NonMissingStats := .getNonMissingFilterStats(.SD, censored_symbol)]
+ input[, NumMeasuredFeature := sum(NonMissingStats),
+ by = c("PROTEIN", "RUN")
+ ]
+ input[, MissingPercentage := 1 - (NumMeasuredFeature / total_features)]
+ input[, more50missing := MissingPercentage >= 0.5]
+ if (!is.null(censored_symbol)) {
+ if (is.element("censored", colnames(input))) {
+ input[, nonmissing_orig := LABEL == "L" & !censored]
+ } else {
+ input[, nonmissing_orig := LABEL == "L" & !is.na(INTENSITY)]
}
- input
+ input[, nonmissing_orig := ifelse(is.na(newABUNDANCE), TRUE, nonmissing_orig)]
+ input[, NumImputedFeature := 0]
+ }
+ input
}
diff --git a/R/utils_plots_common.R b/R/utils_plots_common.R
index 68876574..c90e8701 100644
--- a/R/utils_plots_common.R
+++ b/R/utils_plots_common.R
@@ -1,5 +1,5 @@
#' Theme for MSstats plots
-#'
+#'
#' @param type type of a plot
#' @param x.axis.size size of text on the x axis
#' @param y.axis.size size of text on the y axis
@@ -12,121 +12,128 @@
#' @param text_hjust hjust parameter for x axis text (for condition and comparison plots)
#' @param text_vjust vjust parameter for x axis text (for condition and comparison plots)
#' @param ... additional parameters passed on to ggplot2::theme()
-#'
+#'
#' @import ggplot2
#' @export
-#'
-theme_msstats = function(
- type, x.axis.size = 10, y.axis.size = 10, legend_size = 13,
+#'
+theme_msstats <- function(
+ type, x.axis.size = 10, y.axis.size = 10, legend_size = 13,
strip_background = element_rect(fill = "gray95"),
strip_text_x = element_text(colour = c("black"), size = 14),
- legend_position = "top", legend_box = "vertical", text_angle = 0, text_hjust = NULL, text_vjust = NULL,
- ...
-) {
- if (type %in% c("CONDITIONPLOT", "COMPARISONPLOT")) {
- ggplot2::theme(
- panel.background = element_rect(fill = 'white', colour = "black"),
- axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
- axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
- axis.ticks = element_line(colour = "black"),
- title = element_text(size = x.axis.size + 8, vjust = 1.5),
- panel.grid.major.y = element_line(colour = "grey95"),
- panel.grid.minor.y = element_blank(),
- axis.text.y = element_text(size = y.axis.size, colour = "black"),
- axis.text.x = element_text(size = x.axis.size, colour = "black",
- angle = text_angle, hjust = text_hjust,
- vjust = text_vjust),
- ...
- )
- } else {
- ggplot2::theme(
- panel.background = element_rect(fill = 'white', colour = "black"),
- legend.key = element_rect(fill = 'white', colour = 'white'),
- panel.grid.minor = element_blank(),
- strip.background = strip_background,
- axis.text.x = element_text(size = x.axis.size, colour = "black"),
- axis.text.y = element_text(size = y.axis.size, colour = "black"),
- axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
- axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
- axis.ticks = element_line(colour = "black"),
- title = element_text(size = x.axis.size + 8, vjust = 1.5),
- strip.text.x = strip_text_x,
- legend.position = legend_position,
- legend.box = legend_box,
- legend.text = element_text(size = legend_size),
- ...
- )
- }
+ legend_position = "top", legend_box = "vertical", text_angle = 0, text_hjust = NULL, text_vjust = NULL,
+ ...) {
+ if (type %in% c("CONDITIONPLOT", "COMPARISONPLOT")) {
+ ggplot2::theme(
+ panel.background = element_rect(fill = "white", colour = "black"),
+ axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
+ axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
+ axis.ticks = element_line(colour = "black"),
+ title = element_text(size = x.axis.size + 8, vjust = 1.5),
+ panel.grid.major.y = element_line(colour = "grey95"),
+ panel.grid.minor.y = element_blank(),
+ axis.text.y = element_text(size = y.axis.size, colour = "black"),
+ axis.text.x = element_text(
+ size = x.axis.size, colour = "black",
+ angle = text_angle, hjust = text_hjust,
+ vjust = text_vjust
+ ),
+ ...
+ )
+ } else {
+ ggplot2::theme(
+ panel.background = element_rect(fill = "white", colour = "black"),
+ legend.key = element_rect(fill = "white", colour = "white"),
+ panel.grid.minor = element_blank(),
+ strip.background = strip_background,
+ axis.text.x = element_text(size = x.axis.size, colour = "black"),
+ axis.text.y = element_text(size = y.axis.size, colour = "black"),
+ axis.title.x = element_text(size = x.axis.size + 5, vjust = -0.4),
+ axis.title.y = element_text(size = y.axis.size + 5, vjust = 0.3),
+ axis.ticks = element_line(colour = "black"),
+ title = element_text(size = x.axis.size + 8, vjust = 1.5),
+ strip.text.x = strip_text_x,
+ legend.position = legend_position,
+ legend.box = legend_box,
+ legend.text = element_text(size = legend_size),
+ ...
+ )
+ }
}
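# Editor's illustrative sketch, not part of the patch: theme_msstats() is
# layered onto a ggplot like any other theme. This assumes MSstats and ggplot2
# are attached and uses base R's mtcars data purely as a stand-in.
library(ggplot2)
p <- ggplot(mtcars, aes(x = factor(cyl), y = mpg)) +
  geom_boxplot() +
  theme_msstats(type = "CONDITIONPLOT", x.axis.size = 12, text_angle = 45)
print(p)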
#' Get proteins based on names or integer IDs
-#'
+#'
#' @param chosen_proteins protein names or integers IDs
#' @param all_proteins all unique proteins
-#'
+#'
#' @return character
-#'
+#'
#' @export
-getSelectedProteins = function(chosen_proteins, all_proteins) {
- if (is.character(chosen_proteins)) {
- selected_proteins = chosen_proteins
- missing_proteins = setdiff(selected_proteins, all_proteins)
- if (length(missing_proteins) > 0) {
- stop(paste("Please check protein name. Dataset does not have this protein. -",
- toString(missing_proteins), sep = " "))
- }
+getSelectedProteins <- function(chosen_proteins, all_proteins) {
+ if (is.character(chosen_proteins)) {
+ selected_proteins <- chosen_proteins
+ missing_proteins <- setdiff(selected_proteins, all_proteins)
+ if (length(missing_proteins) > 0) {
+ stop(paste("Please check protein name. Dataset does not have this protein. -",
+ toString(missing_proteins),
+ sep = " "
+ ))
}
- if (is.numeric(chosen_proteins)) {
- selected_proteins <- all_proteins[chosen_proteins]
- if (length(all_proteins) < max(chosen_proteins)) {
- stop(paste("Please check your selection of proteins. There are ",
- length(all_proteins)," proteins in this dataset."))
- }
+ }
+ if (is.numeric(chosen_proteins)) {
+ selected_proteins <- all_proteins[chosen_proteins]
+ if (length(all_proteins) < max(chosen_proteins)) {
+ stop(paste(
+ "Please check your selection of proteins. There are ",
+ length(all_proteins), " proteins in this dataset."
+ ))
}
- selected_proteins
+ }
+ selected_proteins
}
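# Editor's illustrative sketch, not part of the patch: getSelectedProteins()
# accepts either protein names or integer positions and validates both against
# the full protein list. The accessions below are made up.
all_proteins <- c("P01308", "P04637", "Q9Y6K9")
getSelectedProteins("P04637", all_proteins)   # returns "P04637"
getSelectedProteins(2:3, all_proteins)        # returns c("P04637", "Q9Y6K9")
# getSelectedProteins("P99999", all_proteins) # stops: protein not in the dataset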
#' Save a plot to pdf file
-#'
+#'
#' @inheritParams .saveTable
#' @param width width of a plot
#' @param height height of a plot
-#'
+#'
#' @return NULL
-#'
+#'
#' @export
-#'
-savePlot = function(name_base, file_name, width, height) {
- if (name_base != FALSE) {
- width_inches = .convertPixelsToInches(width)
- height_inches = .convertPixelsToInches(height)
- file_path = getFileName(name_base, file_name,
- width, height)
- file_path = paste0(file_path,".pdf")
- pdf(file_path, width = width_inches, height = height_inches)
- }
- NULL
+#'
+savePlot <- function(name_base, file_name, width, height) {
+ if (name_base != FALSE) {
+ width_inches <- .convertPixelsToInches(width)
+ height_inches <- .convertPixelsToInches(height)
+ file_path <- getFileName(
+ name_base, file_name,
+ width, height
+ )
+ file_path <- paste0(file_path, ".pdf")
+ pdf(file_path, width = width_inches, height = height_inches)
+ }
+ NULL
}
-.convertPixelsToInches = function(pixels) {
- # Convert pixels to inches (using standard 72 DPI)
- inches = pixels / 72
- return(inches)
+.convertPixelsToInches <- function(pixels) {
+ # Convert pixels to inches (using standard 72 DPI)
+ inches <- pixels / 72
+ return(inches)
}
-getFileName = function(name_base, file_name, width, height) {
- all_files = list.files(".")
- if(file_name == 'ProfilePlot'){
- num_same_name = sum(grepl(paste0("^", name_base, file_name, "_[0-9]?"), all_files))
- } else {
- num_same_name = sum(grepl(paste0("^", name_base, file_name, "[0-9]?"), all_files))
- }
- if (num_same_name > 0) {
- file_name = paste(file_name, num_same_name + 1, sep = "_")
- }
- file_path = paste0(name_base, file_name)
- return(file_path)
+getFileName <- function(name_base, file_name, width, height) {
+ all_files <- list.files(".")
+ if (file_name == "ProfilePlot") {
+ num_same_name <- sum(grepl(paste0("^", name_base, file_name, "_[0-9]?"), all_files))
+ } else {
+ num_same_name <- sum(grepl(paste0("^", name_base, file_name, "[0-9]?"), all_files))
+ }
+ if (num_same_name > 0) {
+ file_name <- paste(file_name, num_same_name + 1, sep = "_")
+ }
+ file_path <- paste0(name_base, file_name)
+ return(file_path)
}
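# Editor's illustrative sketch, not part of the patch: getFileName() (called by
# savePlot()) appends an integer suffix when a file with the same base name is
# already present, so earlier PDFs are not overwritten. The temporary directory
# is only used to keep the example self-contained.
old_wd <- setwd(tempdir())
file.create("MyProject_ConditionPlot.pdf")
getFileName("MyProject_", "ConditionPlot", width = 10, height = 10)
# "MyProject_ConditionPlot_2"  (one existing match found, so "_2" is appended)
setwd(old_wd)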
@@ -137,15 +144,15 @@ getFileName = function(name_base, file_name, width, height) {
#' an integer will be appended to this name
#' @return NULL
#' @keywords internal
-.saveTable = function(input, name_base, file_name) {
- if (name_base != FALSE) {
- all_files = list.files(".")
- num_same_name = sum(grepl(paste0("^", file_name, "_[0-9]?"), all_files))
- if (num_same_name > 0) {
- file_name = paste(file_name, num_same_name + 1, sep = "_")
- }
- file_path = paste0(paste0(name_base, "/"), file_name, ".pdf")
- data.table::fwrite(input, file = file_path)
+.saveTable <- function(input, name_base, file_name) {
+ if (name_base != FALSE) {
+ all_files <- list.files(".")
+ num_same_name <- sum(grepl(paste0("^", file_name, "_[0-9]?"), all_files))
+ if (num_same_name > 0) {
+ file_name <- paste(file_name, num_same_name + 1, sep = "_")
}
- NULL
+    file_path <- paste0(paste0(name_base, "/"), file_name, ".csv")
+ data.table::fwrite(input, file = file_path)
+ }
+ NULL
}
diff --git a/R/utils_statistics.R b/R/utils_statistics.R
index 24319426..fe52e81b 100644
--- a/R/utils_statistics.R
+++ b/R/utils_statistics.R
@@ -2,11 +2,11 @@
#' @param input data.table
#' @return TRUE invisibly after successful logging
#' @keywords internal
-.logDatasetInformation = function(input) {
- .logSummaryStatistics(input)
- .checkSingleLabelProteins(input)
- .logMissingness(input)
- invisible(TRUE)
+.logDatasetInformation <- function(input) {
+ .logSummaryStatistics(input)
+ .checkSingleLabelProteins(input)
+ .logMissingness(input)
+ invisible(TRUE)
}
@@ -14,81 +14,103 @@
#' @param input data.table
#' @return TRUE invisibly
#' @keywords internal
-.logSummaryStatistics = function(input) {
- RUN = SUBJECT_ORIGINAL = FRACTION = GROUP_ORIGINAL = NumRuns = NULL
- NumBioReplicates = NumFractions = NULL
-
- PEPTIDE = FEATURE = PROTEIN = feature_count = NULL
-
- num_proteins = data.table::uniqueN(input$PROTEIN)
- peptides_per_protein = input[, list(peptide_count = data.table::uniqueN(PEPTIDE)),
- by = "PROTEIN"]
- unique_peptide_feature_pairs = unique(input[, .(PEPTIDE, FEATURE)])
- features_per_peptide = unique_peptide_feature_pairs[, .(feature_count = .N),
- by = PEPTIDE]
- features_per_protein = input[, list(feature_count = data.table::uniqueN(FEATURE)),
- by = "PROTEIN"]
- features_per_protein = features_per_protein[feature_count == 1L, PROTEIN]
-
- pep_per_prot = range(peptides_per_protein$peptide_count)
- feat_per_pept = range(features_per_peptide$feature_count)
- counts_msg = paste("", paste("# proteins:", num_proteins),
- paste("# peptides per protein:",
- paste(pep_per_prot, sep = "-", collapse = "-")),
- paste("# features per peptide:",
- paste(feat_per_pept, sep = "-", collapse = "-")),
- sep = "\n ", collapse = "\n ")
- getOption("MSstatsMsg")("INFO", counts_msg)
- getOption("MSstatsLog")("INFO", counts_msg)
-
- if (length(features_per_protein) > 0) {
- single_features = unique(as.character(features_per_protein))
- n_feat = min(length(single_features), 5)
- msg = paste("Some proteins have only one feature:", "\n",
- paste(unique(as.character(features_per_protein))[1:n_feat],
- sep = ",\n ", collapse = ",\n "),
- "...")
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- }
-
- samples_info = input[, list(NumRuns = data.table::uniqueN(RUN),
- NumBioReplicates = data.table::uniqueN(SUBJECT_ORIGINAL),
- NumFractions = data.table::uniqueN(FRACTION)),
- by = "GROUP_ORIGINAL"]
- samples_info = samples_info[
- ,
- list(GROUP_ORIGINAL, NumRuns, NumBioReplicates,
- NumTechReplicates = as.integer(
- round(NumRuns / (NumBioReplicates * NumFractions)
- )))]
- samples_info = data.table::dcast(data.table::melt(samples_info,
- id.vars = "GROUP_ORIGINAL"),
- variable ~ GROUP_ORIGINAL)
- colnames(samples_info)[1] = ""
- samples_info[, 1] = c("# runs", "# bioreplicates", "# tech. replicates")
-
- samples_info = rbind(data.table::data.table(t(colnames(samples_info))),
- samples_info, use.names = FALSE)
- samples_msg = apply(samples_info, 2, .nicePrint)
- samples_msg = apply(samples_msg, 1, function(x) paste0(x, collapse = ""))
- samples_msg = paste("", samples_msg, sep = "\n")
- getOption("MSstatsLog")("INFO", samples_msg)
- getOption("MSstatsMsg")("INFO", samples_msg)
- invisible(TRUE)
+.logSummaryStatistics <- function(input) {
+ RUN <- SUBJECT_ORIGINAL <- FRACTION <- GROUP_ORIGINAL <- NumRuns <- NULL
+ NumBioReplicates <- NumFractions <- NULL
+
+ PEPTIDE <- FEATURE <- PROTEIN <- feature_count <- NULL
+
+ num_proteins <- data.table::uniqueN(input$PROTEIN)
+ peptides_per_protein <- input[, list(peptide_count = data.table::uniqueN(PEPTIDE)),
+ by = "PROTEIN"
+ ]
+ unique_peptide_feature_pairs <- unique(input[, .(PEPTIDE, FEATURE)])
+ features_per_peptide <- unique_peptide_feature_pairs[, .(feature_count = .N),
+ by = PEPTIDE
+ ]
+ features_per_protein <- input[, list(feature_count = data.table::uniqueN(FEATURE)),
+ by = "PROTEIN"
+ ]
+ features_per_protein <- features_per_protein[feature_count == 1L, PROTEIN]
+
+ pep_per_prot <- range(peptides_per_protein$peptide_count)
+ feat_per_pept <- range(features_per_peptide$feature_count)
+ counts_msg <- paste("", paste("# proteins:", num_proteins),
+ paste(
+ "# peptides per protein:",
+ paste(pep_per_prot, sep = "-", collapse = "-")
+ ),
+ paste(
+ "# features per peptide:",
+ paste(feat_per_pept, sep = "-", collapse = "-")
+ ),
+ sep = "\n ", collapse = "\n "
+ )
+ getOption("MSstatsMsg")("INFO", counts_msg)
+ getOption("MSstatsLog")("INFO", counts_msg)
+
+ if (length(features_per_protein) > 0) {
+ single_features <- unique(as.character(features_per_protein))
+ n_feat <- min(length(single_features), 5)
+ msg <- paste(
+ "Some proteins have only one feature:", "\n",
+ paste(unique(as.character(features_per_protein))[1:n_feat],
+ sep = ",\n ", collapse = ",\n "
+ ),
+ "..."
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ }
+
+ samples_info <- input[, list(
+ NumRuns = data.table::uniqueN(RUN),
+ NumBioReplicates = data.table::uniqueN(SUBJECT_ORIGINAL),
+ NumFractions = data.table::uniqueN(FRACTION)
+ ),
+ by = "GROUP_ORIGINAL"
+ ]
+ samples_info <- samples_info[
+ ,
+ list(GROUP_ORIGINAL, NumRuns, NumBioReplicates,
+ NumTechReplicates = as.integer(
+ round(NumRuns / (NumBioReplicates * NumFractions))
+ )
+ )
+ ]
+ samples_info <- data.table::dcast(
+ data.table::melt(samples_info,
+ id.vars = "GROUP_ORIGINAL"
+ ),
+ variable ~ GROUP_ORIGINAL
+ )
+ colnames(samples_info)[1] <- ""
+ samples_info[, 1] <- c("# runs", "# bioreplicates", "# tech. replicates")
+
+ samples_info <- rbind(data.table::data.table(t(colnames(samples_info))),
+ samples_info,
+ use.names = FALSE
+ )
+ samples_msg <- apply(samples_info, 2, .nicePrint)
+ samples_msg <- apply(samples_msg, 1, function(x) paste0(x, collapse = ""))
+ samples_msg <- paste("", samples_msg, sep = "\n")
+ getOption("MSstatsLog")("INFO", samples_msg)
+ getOption("MSstatsMsg")("INFO", samples_msg)
+ invisible(TRUE)
}
-#' Print a table nicely
+#' Print a table nicely
#' @param string_vector character
#' @return character
#' @keywords internal
-.nicePrint = function(string_vector) {
- max_chars = max(nchar(string_vector))
- string_vector = sapply(string_vector,
- function(x) paste(paste0(rep(" ", max_chars - nchar(x)), collapse = ""), x),
- USE.NAMES = FALSE)
-}
+.nicePrint <- function(string_vector) {
+ max_chars <- max(nchar(string_vector))
+ string_vector <- sapply(string_vector,
+ function(x) paste(paste0(rep(" ", max_chars - nchar(x)), collapse = ""), x),
+ USE.NAMES = FALSE
+ )
+}
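# Editor's illustrative sketch, not part of the patch: .nicePrint() right-aligns
# a character vector by left-padding every entry to the width of the longest
# one, which is how the per-group summary table above is lined up column by
# column.
.nicePrint(c("Control", "Case", "10"))
# " Control" "    Case" "      10"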
#' Print proteins with a single label to the log file
@@ -96,22 +118,26 @@
#' @param label label ("L" or "H")
#' @return TRUE invisibly
#' @keywords internal
-.logSingleLabeledProteins = function(input, label) {
- LABEL = PROTEIN = NULL
-
- name = ifelse(label == "L", "endogeneous", "reference")
- proteins = unique(input[LABEL == label, as.character(PROTEIN)])
- if (length(proteins) > 0) {
- n_prot = min(length(proteins), 5)
- msg = paste(paste("Some proteins only have", name,
- "intensities in label-based experiment",
- "Please check or remove these proteins:"),
- paste(proteins[1:n_prot], sep = ", \n ", collapse = ", \n "),
- "... (see the log file for a full list)")
- getOption("MSstatsMsg")("WARN", msg)
- getOption("MsstatsLog")("WARN", msg)
- }
- invisible(TRUE)
+.logSingleLabeledProteins <- function(input, label) {
+ LABEL <- PROTEIN <- NULL
+
+  name <- ifelse(label == "L", "endogenous", "reference")
+ proteins <- unique(input[LABEL == label, as.character(PROTEIN)])
+ if (length(proteins) > 0) {
+ n_prot <- min(length(proteins), 5)
+ msg <- paste(
+ paste(
+ "Some proteins only have", name,
+ "intensities in label-based experiment",
+ "Please check or remove these proteins:"
+ ),
+ paste(proteins[1:n_prot], sep = ", \n ", collapse = ", \n "),
+ "... (see the log file for a full list)"
+ )
+ getOption("MSstatsMsg")("WARN", msg)
+ getOption("MsstatsLog")("WARN", msg)
+ }
+ invisible(TRUE)
}
@@ -119,20 +145,22 @@
#' @param input data.table
#' @return TRUE invisibly
#' @keywords internal
-.checkSingleLabelProteins = function(input) {
- LABEL = label_count = NULL
-
- if (data.table::uniqueN(input$LABEL) == 2) {
- labels_by_protein = unique(input[,
- list(LABEL,
- label_count = data.table::uniqueN(LABEL)),
- by = "PROTEIN"])
- labels_by_protein = labels_by_protein[label_count == 1L, ]
-
- .logSingleLabeledProteins(labels_by_protein, "L")
- .logSingleLabeledProteins(labels_by_protein, "H")
- }
- invisible(TRUE)
+.checkSingleLabelProteins <- function(input) {
+ LABEL <- label_count <- NULL
+
+ if (data.table::uniqueN(input$LABEL) == 2) {
+ labels_by_protein <- unique(input[,
+ list(LABEL,
+ label_count = data.table::uniqueN(LABEL)
+ ),
+ by = "PROTEIN"
+ ])
+ labels_by_protein <- labels_by_protein[label_count == 1L, ]
+
+ .logSingleLabeledProteins(labels_by_protein, "L")
+ .logSingleLabeledProteins(labels_by_protein, "H")
+ }
+ invisible(TRUE)
}
@@ -140,47 +168,56 @@
#' @param input data.table
#' @return TRUE invisibly
#' @keywords internal
-.logMissingness = function(input) {
- ABUNDANCE = AllMissing = NumMissing = NumTotal = AnyAllMissing = NULL
- FEATURE = FractionMissing = RUN = NULL
-
- input[, is_na_abundance := is.na(ABUNDANCE)]
- missing = input[, .(
- NumMissing = sum(is_na_abundance),
- NumTotal = .N
- ), by = .(LABEL, GROUP, FEATURE)]
- missing[, AllMissing := (NumMissing == NumTotal)]
- any_all_missing = missing[AllMissing == TRUE,
- .(AnyAllMissing = TRUE),
- by = .(LABEL, FEATURE)]
- missing = merge(missing, any_all_missing,
- by = c("LABEL", "FEATURE"), all.x = TRUE)
- missing[is.na(AnyAllMissing), AnyAllMissing := FALSE]
- missing_in_any = as.character(missing[(AnyAllMissing), FEATURE])
- missing_by_run = input[, .(
- NumMissing = sum(is_na_abundance),
- NumTotal = .N
- ), by = "RUN"]
- missing_by_run[, FractionMissing := NumMissing / NumTotal]
- missing_by_run = as.character(missing_by_run[FractionMissing > 0.75,
- unique(RUN)])
-
- if (length(missing_in_any) > 0) {
- num_features = min(length(missing_in_any), 5)
- msg = paste("Some features are completely",
- "missing in at least one condition: ", "\n",
- paste(unique(as.character(missing_in_any))[1:num_features],
- sep = ",\n ",
- collapse = ",\n "), "...")
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- }
-
- if (length(missing_by_run) > 0) {
- msg = paste("The following runs have more than 75% missing values:",
- paste(missing_by_run, sep = ",\n ", collapse = ",\n "))
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- }
- invisible(TRUE)
+.logMissingness <- function(input) {
+ ABUNDANCE <- AllMissing <- NumMissing <- NumTotal <- AnyAllMissing <- NULL
+ FEATURE <- FractionMissing <- RUN <- NULL
+
+ input[, is_na_abundance := is.na(ABUNDANCE)]
+ missing <- input[, .(
+ NumMissing = sum(is_na_abundance),
+ NumTotal = .N
+ ), by = .(LABEL, GROUP, FEATURE)]
+ missing[, AllMissing := (NumMissing == NumTotal)]
+ any_all_missing <- missing[AllMissing == TRUE,
+ .(AnyAllMissing = TRUE),
+ by = .(LABEL, FEATURE)
+ ]
+ missing <- merge(missing, any_all_missing,
+ by = c("LABEL", "FEATURE"), all.x = TRUE
+ )
+ missing[is.na(AnyAllMissing), AnyAllMissing := FALSE]
+ missing_in_any <- as.character(missing[(AnyAllMissing), FEATURE])
+ missing_by_run <- input[, .(
+ NumMissing = sum(is_na_abundance),
+ NumTotal = .N
+ ), by = "RUN"]
+ missing_by_run[, FractionMissing := NumMissing / NumTotal]
+ missing_by_run <- as.character(missing_by_run[
+ FractionMissing > 0.75,
+ unique(RUN)
+ ])
+
+ if (length(missing_in_any) > 0) {
+ num_features <- min(length(missing_in_any), 5)
+ msg <- paste(
+ "Some features are completely",
+ "missing in at least one condition: ", "\n",
+ paste(unique(as.character(missing_in_any))[1:num_features],
+ sep = ",\n ",
+ collapse = ",\n "
+ ), "..."
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ }
+
+ if (length(missing_by_run) > 0) {
+ msg <- paste(
+ "The following runs have more than 75% missing values:",
+ paste(missing_by_run, sep = ",\n ", collapse = ",\n ")
+ )
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ }
+ invisible(TRUE)
}
diff --git a/R/utils_summarization.R b/R/utils_summarization.R
index ec7433d9..82b8b090 100644
--- a/R/utils_summarization.R
+++ b/R/utils_summarization.R
@@ -3,60 +3,72 @@
#' @param remove50missing if TRUE, proteins with more than 50% missing values
#' in all runs will not be summarized
#' @return data.table
-#' @keywords internal
-.isSummarizable = function(input, remove50missing) {
- n_obs_run = RUN = NULL
-
- if (all(is.na(input$newABUNDANCE) | input$newABUNDANCE == 0)) {
- msg = paste("Can't summarize for protein", unique(input$PROTEIN),
- "because all measurements are missing or censored.")
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg)
- return(NULL)
- }
-
- if (all(is.na(input$n_obs) | input$n_obs == 0)) {
- msg = paste("Can't summarize for protein", unique(input$PROTEIN),
- "because all measurements are missing or censored.")
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg)
- return(NULL)
- }
-
- if (all(input$n_obs == 1 | is.na(input$n_obs))) {
- msg = paste("Can't summarize for protein", unique(input$PROTEIN),
- "because features have only one measurement across MS runs.")
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg)
- return(NULL)
- }
-
- if (all(is.na(input$newABUNDANCE) | input$newABUNDANCE == 0) | nrow(input) == 0) {
- msg = paste("After removing features which has only 1 measurement,",
- "Can't summarize for protein", unique(input$PROTEIN),
- "because all measurements are missing or censored.")
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg)
- return(NULL)
- }
-
- missing_runs = setdiff(unique(input$RUN),
- unique(input[n_obs_run == 0 | is.na(n_obs_run), RUN]))
- if (length(missing_runs) > 0 & length(intersect(missing_runs, as.character(unique(input$RUN))))) {
- input = input[n_obs_run > 0 & !is.na(n_obs_run), ]
- }
-
- if (remove50missing) {
- if (all(input$prop_features <= 0.5 | is.na(input$prop_features))) {
- msg = paste("Can't summarize for protein", unique(input$PROTEIN),
- "because all runs have more than 50% missing values and",
- "are removed with the option, remove50missing=TRUE.")
- getOption("MSstatsMsg")("INFO", msg)
- getOption("MSstatsLog")("INFO", msg)
- return(NULL)
- }
+#' @keywords internal
+.isSummarizable <- function(input, remove50missing) {
+ n_obs_run <- RUN <- NULL
+
+ if (all(is.na(input$newABUNDANCE) | input$newABUNDANCE == 0)) {
+ msg <- paste(
+ "Can't summarize for protein", unique(input$PROTEIN),
+ "because all measurements are missing or censored."
+ )
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg)
+ return(NULL)
+ }
+
+ if (all(is.na(input$n_obs) | input$n_obs == 0)) {
+ msg <- paste(
+ "Can't summarize for protein", unique(input$PROTEIN),
+ "because all measurements are missing or censored."
+ )
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg)
+ return(NULL)
+ }
+
+ if (all(input$n_obs == 1 | is.na(input$n_obs))) {
+ msg <- paste(
+ "Can't summarize for protein", unique(input$PROTEIN),
+ "because features have only one measurement across MS runs."
+ )
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg)
+ return(NULL)
+ }
+
+ if (all(is.na(input$newABUNDANCE) | input$newABUNDANCE == 0) | nrow(input) == 0) {
+ msg <- paste(
+ "After removing features which has only 1 measurement,",
+ "Can't summarize for protein", unique(input$PROTEIN),
+ "because all measurements are missing or censored."
+ )
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg)
+ return(NULL)
+ }
+
+ missing_runs <- setdiff(
+ unique(input$RUN),
+ unique(input[n_obs_run == 0 | is.na(n_obs_run), RUN])
+ )
+ if (length(missing_runs) > 0 & length(intersect(missing_runs, as.character(unique(input$RUN))))) {
+ input <- input[n_obs_run > 0 & !is.na(n_obs_run), ]
+ }
+
+ if (remove50missing) {
+ if (all(input$prop_features <= 0.5 | is.na(input$prop_features))) {
+ msg <- paste(
+ "Can't summarize for protein", unique(input$PROTEIN),
+ "because all runs have more than 50% missing values and",
+ "are removed with the option, remove50missing=TRUE."
+ )
+ getOption("MSstatsMsg")("INFO", msg)
+ getOption("MSstatsLog")("INFO", msg)
+ return(NULL)
}
- input
+ }
+ input
}
@@ -66,21 +78,23 @@
#' @inheritParams MSstatsSummarizeWithSingleCore
#' @return data.table
#' @keywords internal
-.runTukey = function(input, is_labeled, censored_symbol, remove50missing) {
- Protein = RUN = newABUNDANCE = NULL
-
- if (nlevels(input$FEATURE) > 1) {
- tmp_result = .fitTukey(input)
- } else {
- if (is_labeled) {
- tmp_result = .adjustLRuns(input, TRUE)
- } else {
- tmp_result = input[input$LABEL == "L",
- list(RUN, LogIntensities = newABUNDANCE)]
- }
+.runTukey <- function(input, is_labeled, censored_symbol, remove50missing) {
+ Protein <- RUN <- newABUNDANCE <- NULL
+
+ if (nlevels(input$FEATURE) > 1) {
+ tmp_result <- .fitTukey(input)
+ } else {
+ if (is_labeled) {
+ tmp_result <- .adjustLRuns(input, TRUE)
+ } else {
+ tmp_result <- input[
+ input$LABEL == "L",
+ list(RUN, LogIntensities = newABUNDANCE)
+ ]
}
- tmp_result[, Protein := unique(input$PROTEIN)]
- tmp_result
+ }
+ tmp_result[, Protein := unique(input$PROTEIN)]
+ tmp_result
}
@@ -88,20 +102,22 @@
#' @inheritParams .runTukey
#' @return data.table
#' @keywords internal
-.fitTukey = function(input) {
- LABEL = RUN = newABUNDANCE = NULL
-
- features = as.character(unique(input$FEATURE))
- wide = data.table::dcast(LABEL + RUN ~ FEATURE, data = input,
- value.var = "newABUNDANCE", keep = TRUE)
- tmp_fitted = median_polish_summary(as.matrix(wide[, features, with = FALSE]))
- wide[, newABUNDANCE := tmp_fitted]
- tmp_result = wide[, list(LABEL, RUN, newABUNDANCE)]
-
- if (data.table::uniqueN(input$LABEL) == 2) {
- tmp_result = .adjustLRuns(tmp_result)
- }
- tmp_result[, list(RUN, LogIntensities = newABUNDANCE)]
+.fitTukey <- function(input) {
+ LABEL <- RUN <- newABUNDANCE <- NULL
+
+ features <- as.character(unique(input$FEATURE))
+ wide <- data.table::dcast(LABEL + RUN ~ FEATURE,
+ data = input,
+ value.var = "newABUNDANCE", keep = TRUE
+ )
+ tmp_fitted <- median_polish_summary(as.matrix(wide[, features, with = FALSE]))
+ wide[, newABUNDANCE := tmp_fitted]
+ tmp_result <- wide[, list(LABEL, RUN, newABUNDANCE)]
+
+ if (data.table::uniqueN(input$LABEL) == 2) {
+ tmp_result <- .adjustLRuns(tmp_result)
+ }
+ tmp_result[, list(RUN, LogIntensities = newABUNDANCE)]
}
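# Editor's illustrative sketch, not part of the patch: the run-level summaries
# above come from Tukey's median polish on a run-by-feature matrix (the package
# uses its own median_polish_summary() routine). The same idea can be seen with
# stats::medpolish on a toy matrix: overall + row effects give one robust
# log-abundance per run.
toy <- matrix(c(20.1, 21.0, 19.5,
                18.9, 19.8, 18.2),
              nrow = 3,
              dimnames = list(paste0("run", 1:3), c("feat1", "feat2")))
mp <- stats::medpolish(toy, trace.iter = FALSE)
mp$overall + mp$row  # one summarized value per run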
@@ -111,19 +127,19 @@
#' @return data.table
#' @importFrom stats median
#' @keywords internal
-.adjustLRuns = function(input, rename = FALSE) {
- LABEL = newABUNDANCE = RUN = newABUNDANCE.h = NULL
-
- h_runs = input[LABEL == "H", list(RUN, newABUNDANCE)]
- h_median = median(input[LABEL == "H", newABUNDANCE], na.rm = TRUE)
- input = input[LABEL == "L"]
- input = merge(input[, list(RUN, newABUNDANCE)], h_runs, by = "RUN", suffixes = c("", ".h"))
- input[, newABUNDANCE := newABUNDANCE - newABUNDANCE.h + h_median]
- if (rename) {
- input[, list(RUN, LogIntensities = newABUNDANCE)]
- } else {
- input[, list(RUN, newABUNDANCE)]
- }
+.adjustLRuns <- function(input, rename = FALSE) {
+ LABEL <- newABUNDANCE <- RUN <- newABUNDANCE.h <- NULL
+
+ h_runs <- input[LABEL == "H", list(RUN, newABUNDANCE)]
+ h_median <- median(input[LABEL == "H", newABUNDANCE], na.rm = TRUE)
+ input <- input[LABEL == "L"]
+ input <- merge(input[, list(RUN, newABUNDANCE)], h_runs, by = "RUN", suffixes = c("", ".h"))
+ input[, newABUNDANCE := newABUNDANCE - newABUNDANCE.h + h_median]
+ if (rename) {
+ input[, list(RUN, LogIntensities = newABUNDANCE)]
+ } else {
+ input[, list(RUN, newABUNDANCE)]
+ }
}
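# Editor's illustrative sketch, not part of the patch: for labeled experiments,
# .adjustLRuns() shifts each run's light-channel value by that run's heavy
# (reference) value and re-centres on the overall heavy median, i.e.
# adjusted_L = L - H + median(H). Toy numbers below.
light <- c(22.0, 21.3, 23.1)      # light-channel summaries per run
heavy <- c(20.5, 19.9, 21.0)      # heavy-channel summaries per run
light - heavy + median(heavy)     # 22.0 21.9 22.6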
@@ -131,18 +147,18 @@
#' @inheritParams .runTukey
#' @return data.table
#' @keywords internal
-.getNonMissingFilterStats = function(input, censored_symbol) {
- if (!is.null(censored_symbol)) {
- if (censored_symbol == "NA") {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored
- } else {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored
- }
+.getNonMissingFilterStats <- function(input, censored_symbol) {
+ if (!is.null(censored_symbol)) {
+ if (censored_symbol == "NA") {
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored
} else {
- nonmissing_filter = input$LABEL == "L" & !is.na(input$INTENSITY)
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$newABUNDANCE) & !input$censored
}
- nonmissing_filter = nonmissing_filter & input$n_obs_run > 0 & input$n_obs > 1
- nonmissing_filter
+ } else {
+ nonmissing_filter <- input$LABEL == "L" & !is.na(input$INTENSITY)
+ }
+ nonmissing_filter <- nonmissing_filter & input$n_obs_run > 0 & input$n_obs > 1
+ nonmissing_filter
}
@@ -154,27 +170,29 @@
#' @return lm or merMod
#' @importFrom stats lm
#' @keywords internal
-.fitLinearModel = function(input, is_single_feature, is_labeled,
- equal_variances) {
- if (!is_labeled) {
- if (is_single_feature) {
- linear_model = lm(ABUNDANCE ~ RUN , data = input)
- } else {
- linear_model = lm(ABUNDANCE ~ FEATURE + RUN , data = input)
- }
+.fitLinearModel <- function(input, is_single_feature, is_labeled,
+ equal_variances) {
+ if (!is_labeled) {
+ if (is_single_feature) {
+ linear_model <- lm(ABUNDANCE ~ RUN, data = input)
} else {
- if (is_single_feature) {
- linear_model = lm(ABUNDANCE ~ RUN + ref , data = input)
- } else {
- linear_model = lm(ABUNDANCE ~ FEATURE + RUN + ref, data = input)
- }
+ linear_model <- lm(ABUNDANCE ~ FEATURE + RUN, data = input)
}
- if (!equal_variances) {
- linear_model = .updateUnequalVariances(input = input,
- fit = linear_model,
- num_iter = 1)
+ } else {
+ if (is_single_feature) {
+ linear_model <- lm(ABUNDANCE ~ RUN + ref, data = input)
+ } else {
+ linear_model <- lm(ABUNDANCE ~ FEATURE + RUN + ref, data = input)
}
- linear_model
+ }
+ if (!equal_variances) {
+ linear_model <- .updateUnequalVariances(
+ input = input,
+ fit = linear_model,
+ num_iter = 1
+ )
+ }
+ linear_model
}
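# Editor's illustrative sketch, not part of the patch: for an unlabeled protein
# with several features and equal variances, .fitLinearModel() reduces to an
# ordinary two-way fixed-effects fit, ABUNDANCE ~ FEATURE + RUN, whose RUN
# coefficients carry the per-run protein summaries. Toy data generated here.
set.seed(1)
toy <- expand.grid(FEATURE = factor(paste0("feat", 1:3)), RUN = factor(1:4))
toy$ABUNDANCE <- 20 + as.integer(toy$FEATURE) + 0.5 * as.integer(toy$RUN) +
  rnorm(nrow(toy), sd = 0.1)
fit <- lm(ABUNDANCE ~ FEATURE + RUN, data = toy)
coef(fit)[grepl("^RUN", names(coef(fit)))]  # run effects relative to run 1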
@@ -186,30 +204,31 @@
#' @importFrom stats loess resid lm formula
#' @return merMod
#' @keywords internal
-.updateUnequalVariances = function(input, fit, num_iter) {
- weight = NULL
-
- for (i in seq_len(num_iter)) {
- if (i == 1) {
- abs.resids = data.frame(abs.resids = abs(fit$residuals))
- fitted = data.frame(fitted = fit$fitted.values)
- input = data.frame(input,
- "abs.resids" = abs.resids,
- "fitted" = fitted)
- }
- fit.loess = loess(abs.resids ~ fitted, data = input)
- loess.fitted = data.frame(loess.fitted = fitted(fit.loess))
- input = data.frame(input, "loess.fitted" = loess.fitted)
- ## loess fitted valuaes are predicted sd
- input$weight = 1 / (input$loess.fitted ^ 2)
- input = input[, !(colnames(input) %in% "abs.resids")]
- ## re-fit using weight
- wls.fit = lm(formula(fit), data = input, weights = weight)
- abs.resids = data.frame(abs.resids = abs(wls.fit$residuals))
- input = data.frame(input, "abs.resids" = abs.resids)
- input = input[, -which(colnames(input) %in% c("loess.fitted", "weight"))]
+.updateUnequalVariances <- function(input, fit, num_iter) {
+ weight <- NULL
+
+ for (i in seq_len(num_iter)) {
+ if (i == 1) {
+ abs.resids <- data.frame(abs.resids = abs(fit$residuals))
+ fitted <- data.frame(fitted = fit$fitted.values)
+ input <- data.frame(input,
+ "abs.resids" = abs.resids,
+ "fitted" = fitted
+ )
}
- wls.fit
+ fit.loess <- loess(abs.resids ~ fitted, data = input)
+ loess.fitted <- data.frame(loess.fitted = fitted(fit.loess))
+ input <- data.frame(input, "loess.fitted" = loess.fitted)
+    ## loess fitted values are predicted sd
+ input$weight <- 1 / (input$loess.fitted^2)
+ input <- input[, !(colnames(input) %in% "abs.resids")]
+ ## re-fit using weight
+ wls.fit <- lm(formula(fit), data = input, weights = weight)
+ abs.resids <- data.frame(abs.resids = abs(wls.fit$residuals))
+ input <- data.frame(input, "abs.resids" = abs.resids)
+ input <- input[, -which(colnames(input) %in% c("loess.fitted", "weight"))]
+ }
+ wls.fit
}
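The loop above is, in effect, one or more rounds of loess-based variance modeling followed by weighted least squares. A compact sketch of a single round on simulated heteroscedastic data (variable names are illustrative only):

set.seed(2)
x <- runif(60, 1, 10)
y <- 2 + 0.5 * x + rnorm(60, sd = 0.2 * x)              # noise grows with the mean
fit <- lm(y ~ x)

sd_hat <- fitted(loess(abs(resid(fit)) ~ fitted(fit)))  # smoothed |residuals| ~ predicted sd
w <- 1 / sd_hat^2                                       # weight = 1 / predicted variance
wls_fit <- lm(y ~ x, weights = w)                       # weighted re-fit
summary(wls_fit)$coefficients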
@@ -217,6 +236,6 @@
#' @param input data.table
#' @return logical
#' @keywords internal
-.checkSingleFeature = function(input) {
- data.table::uniqueN(input$FEATURE) < 2
+.checkSingleFeature <- function(input) {
+ data.table::uniqueN(input$FEATURE) < 2
}
diff --git a/R/utils_summarization_prepare.R b/R/utils_summarization_prepare.R
index 251055da..718dc6d9 100644
--- a/R/utils_summarization_prepare.R
+++ b/R/utils_summarization_prepare.R
@@ -1,97 +1,100 @@
#' Prepare feature-level data for protein-level summarization
-#'
+#'
#' @param input feature-level data processed by dataProcess subfunctions
#' @param method summarization method - `summaryMethod` parameter of the dataProcess function
#' @param impute if TRUE, censored missing values will be imputed - `MBimpute`
#' parameter of the dataProcess function
-#' @param censored_symbol censored missing value indicator - `censoredInt`
+#' @param censored_symbol censored missing value indicator - `censoredInt`
#' parameter of the dataProcess function
-#' @param remove_uninformative_feature_outlier if TRUE, features labeled as
-#' outlier of uninformative by the MSstatsSelectFeatures function will not be
+#' @param remove_uninformative_feature_outlier if TRUE, features labeled as
+#' outlier or uninformative by the MSstatsSelectFeatures function will not be
#' used in summarization
-#'
+#'
#' @return data.table
-#'
+#'
#' @export
-#'
-#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#'
+#' @examples
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
#' head(input)
-#'
-MSstatsPrepareForSummarization = function(input, method, impute, censored_symbol,
- remove_uninformative_feature_outlier) {
- ABUNDANCE = feature_quality = is_outlier = PROTEIN = NULL
-
- label = data.table::uniqueN(input$LABEL) == 2
- if (label) {
- input[, ref := factor(ifelse(LABEL == "L", RUN, 0))]
- }
-
- if (is.element("remove", colnames(input))) {
- input = input[!(remove)]
- }
-
- if (remove_uninformative_feature_outlier &
- is.element("feature_quality", colnames(input))) {
- input[, ABUNDANCE := ifelse(feature_quality == "Uninformative",
- NA, ABUNDANCE)]
- input[, ABUNDANCE := ifelse(is_outlier, NA, ABUNDANCE)]
- msg = "** Filtered out uninformative features and outliers."
- getOption("MSstatsLog")("INFO", msg)
- getOption("MSstatsMsg")("INFO", msg)
- }
-
- input = .prepareSummary(input, method, impute, censored_symbol)
- input[, PROTEIN := factor(PROTEIN)]
- input
+#'
+MSstatsPrepareForSummarization <- function(input, method, impute, censored_symbol,
+ remove_uninformative_feature_outlier) {
+ ABUNDANCE <- feature_quality <- is_outlier <- PROTEIN <- NULL
+
+ label <- data.table::uniqueN(input$LABEL) == 2
+ if (label) {
+ input[, ref := factor(ifelse(LABEL == "L", RUN, 0))]
+ }
+
+ if (is.element("remove", colnames(input))) {
+ input <- input[!(remove)]
+ }
+
+ if (remove_uninformative_feature_outlier &
+ is.element("feature_quality", colnames(input))) {
+ input[, ABUNDANCE := ifelse(feature_quality == "Uninformative",
+ NA, ABUNDANCE
+ )]
+ input[, ABUNDANCE := ifelse(is_outlier, NA, ABUNDANCE)]
+ msg <- "** Filtered out uninformative features and outliers."
+ getOption("MSstatsLog")("INFO", msg)
+ getOption("MSstatsMsg")("INFO", msg)
+ }
+
+ input <- .prepareSummary(input, method, impute, censored_symbol)
+ input[, PROTEIN := factor(PROTEIN)]
+ input
}
#' Get feature-level data to be used in the MSstatsSummarizationOutput function
-#'
+#'
#' @param input data.table processed by dataProcess subfunctions
-#'
+#'
#' @return data.table processed by dataProcess subfunctions
-#'
+#'
#' @export
-#'
-#' @examples
-#' raw = DDARawData
-#' method = "TMP"
-#' cens = "NA"
-#' impute = TRUE
+#'
+#' @examples
+#' raw <- DDARawData
+#' method <- "TMP"
+#' cens <- "NA"
+#' impute <- TRUE
#' MSstatsConvert::MSstatsLogsSettings(FALSE)
-#' input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-#' input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-#' input = MSstatsMergeFractions(input)
-#' input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-#' input_all = MSstatsSelectFeatures(input, "all") # all features
-#' input_5 = MSstatsSelectFeatures(data.table::copy(input),
-#' "topN", top_n = 5) # top 5 features
-#'
-#' proc1 = getProcessed(input_all)
-#' proc2 = getProcessed(input_5)
-#'
+#' input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+#' input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+#' input <- MSstatsMergeFractions(input)
+#' input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+#' input_all <- MSstatsSelectFeatures(input, "all") # all features
+#' input_5 <- MSstatsSelectFeatures(data.table::copy(input),
+#' "topN",
+#' top_n = 5
+#' ) # top 5 features
+#'
+#' proc1 <- getProcessed(input_all)
+#' proc2 <- getProcessed(input_5)
+#'
#' proc1
#' proc2
-#'
-getProcessed = function(input) {
- remove = NULL
-
- if (is.element("remove", colnames(input))) {
- if (all(!(input$remove))) {
- NULL
- } else {
- input[(remove)]
- }
+#'
+getProcessed <- function(input) {
+ remove <- NULL
+
+ if (is.element("remove", colnames(input))) {
+ if (all(!(input$remove))) {
+ NULL
} else {
- NULL
+ input[(remove)]
}
+ } else {
+ NULL
+ }
}
@@ -102,13 +105,13 @@ getProcessed = function(input) {
#' @param censored_symbol "0"/"NA"
#' @return data.table
#' @keywords internal
-.prepareSummary = function(input, method, impute, censored_symbol) {
- if (method == "TMP") {
- input = .prepareTMP(input, impute, censored_symbol)
- } else {
- input = .prepareLinear(input, FALSE, censored_symbol)
- }
- input
+.prepareSummary <- function(input, method, impute, censored_symbol) {
+ if (method == "TMP") {
+ input <- .prepareTMP(input, impute, censored_symbol)
+ } else {
+ input <- .prepareLinear(input, FALSE, censored_symbol)
+ }
+ input
}
@@ -116,21 +119,22 @@ getProcessed = function(input) {
#' @inheritParams .prepareSummary
#' @return data.table
#' @keywords internal
-.prepareLinear = function(input, impute, censored_symbol) {
- newABUNDANCE = ABUNDANCE = nonmissing = n_obs = n_obs_run = NULL
- total_features = FEATURE = prop_features = NULL
-
- input[, newABUNDANCE := ABUNDANCE]
- input[, nonmissing := .getNonMissingFilter(.SD, impute, censored_symbol)]
- input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")]
- # remove feature with 1 measurement
- input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)]
- input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")]
-
- input[, total_features := uniqueN(FEATURE), by = "PROTEIN"]
- input[, prop_features := sum(nonmissing) / total_features,
- by = c("PROTEIN", "RUN")]
- input
+.prepareLinear <- function(input, impute, censored_symbol) {
+ newABUNDANCE <- ABUNDANCE <- nonmissing <- n_obs <- n_obs_run <- NULL
+ total_features <- FEATURE <- prop_features <- NULL
+
+ input[, newABUNDANCE := ABUNDANCE]
+ input[, nonmissing := .getNonMissingFilter(.SD, impute, censored_symbol)]
+ input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")]
+ # remove feature with 1 measurement
+ input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)]
+ input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")]
+
+ input[, total_features := uniqueN(FEATURE), by = "PROTEIN"]
+ input[, prop_features := sum(nonmissing) / total_features,
+ by = c("PROTEIN", "RUN")
+ ]
+ input
}
@@ -138,40 +142,42 @@ getProcessed = function(input) {
#' @inheritParams .prepareSummary
#' @return data.table
#' @keywords internal
-.prepareTMP = function(input, impute, censored_symbol) {
- censored = feature_quality = newABUNDANCE = cen = nonmissing = n_obs = NULL
- n_obs_run = total_features = FEATURE = prop_features = NULL
- remove50missing = ABUNDANCE = NULL
-
- if (impute & !is.null(censored_symbol)) {
- if (is.element("feature_quality", colnames(input))) {
- input[, censored := ifelse(feature_quality == "Informative",
- censored, FALSE)]
- }
- if (censored_symbol == "0") {
- input[, newABUNDANCE := ifelse(censored, 0, ABUNDANCE)]
- } else if (censored_symbol == "NA") {
- input[, newABUNDANCE := ifelse(censored, NA, ABUNDANCE)]
- }
- input[, cen := ifelse(censored, 0, 1)]
- } else {
- input[, newABUNDANCE := ABUNDANCE]
+.prepareTMP <- function(input, impute, censored_symbol) {
+ censored <- feature_quality <- newABUNDANCE <- cen <- nonmissing <- n_obs <- NULL
+ n_obs_run <- total_features <- FEATURE <- prop_features <- NULL
+ remove50missing <- ABUNDANCE <- NULL
+
+ if (impute & !is.null(censored_symbol)) {
+ if (is.element("feature_quality", colnames(input))) {
+ input[, censored := ifelse(feature_quality == "Informative",
+ censored, FALSE
+ )]
}
-
- input[, nonmissing := .getNonMissingFilter(input, impute, censored_symbol)]
- input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")]
- input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)]
- input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")]
-
- input[, total_features := uniqueN(FEATURE), by = "PROTEIN"]
- input[, prop_features := sum(nonmissing) / total_features,
- by = c("PROTEIN", "RUN")]
-
- if (is.element("cen", colnames(input))) {
- if (any(input[["cen"]] == 0)) {
- .setCensoredByThreshold(input, censored_symbol, remove50missing)
- }
+ if (censored_symbol == "0") {
+ input[, newABUNDANCE := ifelse(censored, 0, ABUNDANCE)]
+ } else if (censored_symbol == "NA") {
+ input[, newABUNDANCE := ifelse(censored, NA, ABUNDANCE)]
+ }
+ input[, cen := ifelse(censored, 0, 1)]
+ } else {
+ input[, newABUNDANCE := ABUNDANCE]
+ }
+
+ input[, nonmissing := .getNonMissingFilter(input, impute, censored_symbol)]
+ input[, n_obs := sum(nonmissing), by = c("PROTEIN", "FEATURE")]
+ input[, nonmissing := ifelse(n_obs <= 1, FALSE, nonmissing)]
+ input[, n_obs_run := sum(nonmissing), by = c("PROTEIN", "RUN")]
+
+ input[, total_features := uniqueN(FEATURE), by = "PROTEIN"]
+ input[, prop_features := sum(nonmissing) / total_features,
+ by = c("PROTEIN", "RUN")
+ ]
+
+ if (is.element("cen", colnames(input))) {
+ if (any(input[["cen"]] == 0)) {
+ .setCensoredByThreshold(input, censored_symbol, remove50missing)
}
+ }
- input
+ input
}
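For reference, a minimal sketch (invented values) of the recoding that `.prepareTMP` applies when `impute = TRUE`: censored observations are overwritten with the censoring symbol in `newABUNDANCE`, and `cen` becomes the 0/1 indicator used downstream when censored values are imputed.

library(data.table)

dt <- data.table(
  ABUNDANCE = c(12.1, 3.0, 11.4, 2.5),
  censored  = c(FALSE, TRUE, FALSE, TRUE)
)
censored_symbol <- "NA"
if (censored_symbol == "0") {
  dt[, newABUNDANCE := ifelse(censored, 0, ABUNDANCE)]
} else if (censored_symbol == "NA") {
  dt[, newABUNDANCE := ifelse(censored, NA, ABUNDANCE)]
}
dt[, cen := ifelse(censored, 0, 1)]   # 0 = censored, 1 = observed
dt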
diff --git a/benchmark/benchmark_Dowell2021-HEqe408_LFQ.R b/benchmark/benchmark_Dowell2021-HEqe408_LFQ.R
index 4d0811f8..db34aeea 100644
--- a/benchmark/benchmark_Dowell2021-HEqe408_LFQ.R
+++ b/benchmark/benchmark_Dowell2021-HEqe408_LFQ.R
@@ -21,7 +21,7 @@ fragpipe_raw <- data.table::fread(dataset_config$file)
head(fragpipe_raw)
-msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)
+msstats_format <- MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)
data_process_tasks <- list(
@@ -45,11 +45,11 @@ data_process_tasks <- list(
start_time <- Sys.time()
-num_cores <- detectCores() - 1
+num_cores <- detectCores() - 1
summarized_results <- mclapply(data_process_tasks, function(task) {
list(label = task$label, summarized = task$result())
-}, mc.cores = num_cores)
+}, mc.cores = num_cores)
results_list <- mclapply(summarized_results, function(res) {
@@ -61,4 +61,4 @@ final_results <- do.call(rbind, results_list)
end_time <- Sys.time()
total_time <- end_time - start_time
print(final_results)
-print(paste("Total Execution Time:", total_time))
\ No newline at end of file
+print(paste("Total Execution Time:", total_time))
diff --git a/benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R b/benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R
index 389a46d5..d1875253 100644
--- a/benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R
+++ b/benchmark/benchmark_Puyvelde2022-HYE5600735_LFQ.R
@@ -20,7 +20,7 @@ start_time <- Sys.time()
fragpipe_raw <- data.table::fread(dataset_config$file)
head(fragpipe_raw)
-msstats_format = MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)
+msstats_format <- MSstatsConvert::FragPipetoMSstatsFormat(fragpipe_raw, use_log_file = FALSE)
data_process_tasks <- list(
@@ -44,11 +44,11 @@ data_process_tasks <- list(
start_time <- Sys.time()
-num_cores <- detectCores() - 1
+num_cores <- detectCores() - 1
summarized_results <- mclapply(data_process_tasks, function(task) {
list(label = task$label, summarized = task$result())
-}, mc.cores = num_cores)
+}, mc.cores = num_cores)
results_list <- mclapply(summarized_results, function(res) {
@@ -60,4 +60,4 @@ final_results <- do.call(rbind, results_list)
end_time <- Sys.time()
total_time <- end_time - start_time
print(final_results)
-print(paste("Total Execution Time:", total_time))
\ No newline at end of file
+print(paste("Total Execution Time:", total_time))
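Both benchmark scripts rely on fork-based mclapply, which only parallelizes on Linux and macOS; on Windows, mc.cores > 1 is an error. A hedged sketch of a more defensive core count, not part of the scripts themselves:

library(parallel)

num_cores <- if (.Platform$OS.type == "windows") {
  1L                                          # forking is unavailable on Windows
} else {
  max(1L, detectCores() - 1L, na.rm = TRUE)   # leave one core free; guard against NA
}
results <- mclapply(1:4, function(i) i^2, mc.cores = num_cores)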
diff --git a/benchmark/calculateMetrics.R b/benchmark/calculateMetrics.R
index 3dacfdc0..5b81656e 100644
--- a/benchmark/calculateMetrics.R
+++ b/benchmark/calculateMetrics.R
@@ -5,18 +5,18 @@ library(stringr)
calculateResult <- function(summarized, label, samples) {
model <- groupComparison("pairwise", summarized)
comparisonResult <- model$ComparisonResult
-
+
TP <- 0
FP <- 0
TN <- 0
FN <- 0
-
+
for (sample_name in names(samples)) {
sample <- samples[[sample_name]]
is_significant <- sample$type == "significant"
-
+
filtered_proteins <- comparisonResult %>% filter(grepl(sample$pattern, Protein))
-
+
if (is_significant) {
TP <- TP + nrow(filtered_proteins %>% filter(adj.pvalue < 0.05))
FN <- FN + nrow(filtered_proteins %>% filter(adj.pvalue >= 0.05))
@@ -25,18 +25,18 @@ calculateResult <- function(summarized, label, samples) {
TN <- TN + nrow(filtered_proteins %>% filter(adj.pvalue >= 0.05))
}
}
-
+
FPR <- FP / (FP + TN)
accuracy <- (TP + TN) / (TP + TN + FP + FN)
recall <- TP / (TP + FN)
fdr <- FP / (FP + TP)
-
+
cat("Metrics for Label:", label, "\n")
cat("True Positives (TP):", TP, "\n")
cat("False Positives (FP):", FP, "\n")
cat("True Negatives (TN):", TN, "\n")
cat("False Negatives (FN):", FN, "\n\n")
-
+
comparisonResult %>%
filter(is.finite(log2FC)) %>%
ggplot(aes(y = log2FC)) +
@@ -44,7 +44,7 @@ calculateResult <- function(summarized, label, samples) {
geom_hline(yintercept = -2, linetype = "dashed", color = "red", linewidth = 1.5) +
theme_bw() +
labs(title = paste("Boxplot of log2FC for", label), y = "log2FC")
-
+
results <- data.frame(
Label = label,
TP = TP,
@@ -56,6 +56,6 @@ calculateResult <- function(summarized, label, samples) {
Recall = recall,
FDR = fdr
)
-
+
return(results)
}
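A quick worked example of the four summary metrics computed above, using invented counts:

TP <- 90; FP <- 10; TN <- 190; FN <- 10

FPR      <- FP / (FP + TN)                   # 10 / 200 = 0.05
accuracy <- (TP + TN) / (TP + TN + FP + FN)  # 280 / 300 ~ 0.93
recall   <- TP / (TP + FN)                   # 90 / 100 = 0.90
fdr      <- FP / (FP + TP)                   # 10 / 100 = 0.10
c(FPR = FPR, Accuracy = accuracy, Recall = recall, FDR = fdr)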
diff --git a/inst/tinytest/test_MSstatsdev.R b/inst/tinytest/test_MSstatsdev.R
index aa3f72e0..77f83e8e 100644
--- a/inst/tinytest/test_MSstatsdev.R
+++ b/inst/tinytest/test_MSstatsdev.R
@@ -1,4 +1,2 @@
-
# Placeholder with simple test
expect_equal(1 + 1, 2)
-
diff --git a/inst/tinytest/test_dataProcess.R b/inst/tinytest/test_dataProcess.R
index 3b2a8cf0..f70b0914 100644
--- a/inst/tinytest/test_dataProcess.R
+++ b/inst/tinytest/test_dataProcess.R
@@ -1,27 +1,39 @@
# Test dataProcess with default parameters ------------------------------------
-QuantDataDefault = dataProcess(SRMRawData, use_log_file = FALSE)
-QuantDataDefaultLinear = dataProcess(DDARawData, use_log_file = FALSE,
- summaryMethod = "linear")
+QuantDataDefault <- dataProcess(SRMRawData, use_log_file = FALSE)
+QuantDataDefaultLinear <- dataProcess(DDARawData,
+ use_log_file = FALSE,
+ summaryMethod = "linear"
+)
# Test dataProcess with numberOfCores parameter ----------------------
-QuantDataParallel = dataProcess(SRMRawData, use_log_file = FALSE,
- numberOfCores = 2)
-QuantDataParallelLinear = dataProcess(DDARawData, use_log_file = FALSE,
- summaryMethod = "linear", numberOfCores = 2)
+QuantDataParallel <- dataProcess(SRMRawData,
+ use_log_file = FALSE,
+ numberOfCores = 2
+)
+QuantDataParallelLinear <- dataProcess(DDARawData,
+ use_log_file = FALSE,
+ summaryMethod = "linear", numberOfCores = 2
+)
-expect_equal(nrow(QuantDataDefault$FeatureLevelData),
- nrow(QuantDataParallel$FeatureLevelData))
+expect_equal(
+ nrow(QuantDataDefault$FeatureLevelData),
+ nrow(QuantDataParallel$FeatureLevelData)
+)
-expect_equal(nrow(QuantDataDefaultLinear$FeatureLevelData),
- nrow(QuantDataParallelLinear$FeatureLevelData))
+expect_equal(
+ nrow(QuantDataDefaultLinear$FeatureLevelData),
+ nrow(QuantDataParallelLinear$FeatureLevelData)
+)
# Test dataProcess with technical replicates & fractions ------------------
-msstats_input_fractions_techreps = data.table::fread(
- system.file("tinytest/processed_data/input_techreps_fractions.csv",
- package = "MSstats")
+msstats_input_fractions_techreps <- data.table::fread(
+ system.file("tinytest/processed_data/input_techreps_fractions.csv",
+ package = "MSstats"
+ )
+)
+QuantDataTechRepsFractions <- dataProcess(msstats_input_fractions_techreps,
+ use_log_file = FALSE
)
-QuantDataTechRepsFractions = dataProcess(msstats_input_fractions_techreps,
- use_log_file = FALSE)
expect_true(!is.null(QuantDataTechRepsFractions))
-expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)
\ No newline at end of file
+expect_true(nrow(QuantDataTechRepsFractions$FeatureLevelData) > 0)
diff --git a/inst/tinytest/test_groupComparison.R b/inst/tinytest/test_groupComparison.R
index fc3997a3..c7cef06e 100644
--- a/inst/tinytest/test_groupComparison.R
+++ b/inst/tinytest/test_groupComparison.R
@@ -1,20 +1,26 @@
# Setup ------------------------------------------------------------------
-QuantData = dataProcess(SRMRawData, use_log_file = FALSE)
-comparison = matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-row.names(comparison) = "T7-T1"
-groups = levels(QuantData$ProteinLevelData$GROUP)
-colnames(comparison) = groups[order(as.numeric(groups))]
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+row.names(comparison) <- "T7-T1"
+groups <- levels(QuantData$ProteinLevelData$GROUP)
+colnames(comparison) <- groups[order(as.numeric(groups))]
# Test groupComparison with default parameters ---------------------------
-testResultDefaultComparison = groupComparison(contrast.matrix=comparison,
- data=QuantData,
- use_log_file = FALSE)
+testResultDefaultComparison <- groupComparison(
+ contrast.matrix = comparison,
+ data = QuantData,
+ use_log_file = FALSE
+)
# Test groupComparison with numberOfCores parameter ----------------------
-testResultParallelComparison = groupComparison(contrast.matrix=comparison,
- data=QuantData,
- use_log_file = FALSE,
- numberOfCores = 2)
+testResultParallelComparison <- groupComparison(
+ contrast.matrix = comparison,
+ data = QuantData,
+ use_log_file = FALSE,
+ numberOfCores = 2
+)
-expect_equal(nrow(testResultDefaultComparison$ComparisonResult),
- nrow(testResultParallelComparison$ComparisonResult))
\ No newline at end of file
+expect_equal(
+ nrow(testResultDefaultComparison$ComparisonResult),
+ nrow(testResultParallelComparison$ComparisonResult)
+)
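The `order(as.numeric(groups))` step in the setup above matters because GROUP levels are stored as character strings, which sort lexicographically; a short illustration on a toy vector (not the SRM data):

groups <- c("1", "10", "2", "7")
sort(groups)                        # "1" "10" "2" "7"   (lexicographic)
groups[order(as.numeric(groups))]   # "1" "2" "7" "10"   (intended numeric order)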
diff --git a/inst/tinytest/test_groupComparisonQCPlots.R b/inst/tinytest/test_groupComparisonQCPlots.R
index a0bc4eec..4c9432c5 100644
--- a/inst/tinytest/test_groupComparisonQCPlots.R
+++ b/inst/tinytest/test_groupComparisonQCPlots.R
@@ -4,44 +4,43 @@ library(MSstats)
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
# Tests for differentially abundant proteins with models:
# label-based SRM experiment with expanded scope of biological replication.
-testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,use_log_file = FALSE)
+testResultOneComparison <- groupComparison(contrast.matrix = comparison, data = QuantData, use_log_file = FALSE)
-test_that('Validate groupComparisonQCPlots call', {
-
+test_that("Validate groupComparisonQCPlots call", {
mock_modelBasedQCPlots <- mock()
-
- stub(groupComparisonQCPlots, 'modelBasedQCPlots', mock_modelBasedQCPlots)
-
+
+ stub(groupComparisonQCPlots, "modelBasedQCPlots", mock_modelBasedQCPlots)
+
groupComparisonQCPlots(
- data = testResultOneComparison,
- type = "QQPlots",
- axis.size = 12,
- dot.size = 3,
- width = 7,
- height = 5,
- which.Protein = "P0A894",
+ data = testResultOneComparison,
+ type = "QQPlots",
+ axis.size = 12,
+ dot.size = 3,
+ width = 7,
+ height = 5,
+ which.Protein = "P0A894",
address = FALSE
)
-
+
expect_called(mock_modelBasedQCPlots, 1)
-
+
args <- mock_args(mock_modelBasedQCPlots)
-
- expect_equal(args[[1]][[1]], testResultOneComparison) # data
- expect_equal(args[[1]][[2]], "QQPlots") # type
- expect_equal(args[[1]][[3]], 12) # axis.size
- expect_equal(args[[1]][[4]], 3) # dot.size
- expect_equal(args[[1]][[5]], 7) # width
- expect_equal(args[[1]][[6]], 5) # height
- expect_equal(args[[1]][[7]], "P0A894") # which.Protein
- expect_equal(args[[1]][[8]], FALSE) # address
+
+ expect_equal(args[[1]][[1]], testResultOneComparison) # data
+ expect_equal(args[[1]][[2]], "QQPlots") # type
+ expect_equal(args[[1]][[3]], 12) # axis.size
+ expect_equal(args[[1]][[4]], 3) # dot.size
+ expect_equal(args[[1]][[5]], 7) # width
+ expect_equal(args[[1]][[6]], 5) # height
+ expect_equal(args[[1]][[7]], "P0A894") # which.Protein
+ expect_equal(args[[1]][[8]], FALSE) # address
})
diff --git a/inst/tinytest/test_making_contrasts.R b/inst/tinytest/test_making_contrasts.R
index 30c7ba27..fa836f81 100644
--- a/inst/tinytest/test_making_contrasts.R
+++ b/inst/tinytest/test_making_contrasts.R
@@ -3,6 +3,6 @@
# list("a", c("b", "c")))
# conditions = c("a", "b", "c")
# labels = c("a vs b", "a vs c", "a vs b,c")
-#
+#
# MSstatsContrastMatrix(contrasts, conditions, labels)
# MSstatsContrastMatrix(contrasts, conditions, NULL)
diff --git a/inst/tinytest/test_utils_censored.R b/inst/tinytest/test_utils_censored.R
index 78dad7c7..395deecb 100644
--- a/inst/tinytest/test_utils_censored.R
+++ b/inst/tinytest/test_utils_censored.R
@@ -1,15 +1,15 @@
# Test .setCensoredByThreshold
dt_na <- data.table::data.table(
- PROTEIN = c("P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2"),
- FEATURE = c("F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2", "F3", "F3", "F4", "F4", "F3", "F4", "F3", "F3", "F4", "F4", "F3", "F4"),
- LABEL = c("L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H"),
- RUN = c("R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4", "R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4"),
- newABUNDANCE = c(1.5, NA, 2, 2.2, 1.4, 1.6, 1.5, 1.9, 2, 2.2, 1.4, 1.6, 0, 4, 2.5, NA, 3, 3.2, NA, 4, NA, NA, 3, 3.2),
- censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
- nonmissing = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
- n_obs = c(3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4, 3, 3, 1, 1, 3, 1, 3, 3, 1, 1, 3, 1),
- n_obs_run = c(2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1),
- total_features = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
+ PROTEIN = c("P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2"),
+ FEATURE = c("F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2", "F3", "F3", "F4", "F4", "F3", "F4", "F3", "F3", "F4", "F4", "F3", "F4"),
+ LABEL = c("L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H"),
+ RUN = c("R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4", "R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4"),
+ newABUNDANCE = c(1.5, NA, 2, 2.2, 1.4, 1.6, 1.5, 1.9, 2, 2.2, 1.4, 1.6, 0, 4, 2.5, NA, 3, 3.2, NA, 4, NA, NA, 3, 3.2),
+ censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE),
+ nonmissing = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
+ n_obs = c(3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4, 3, 3, 1, 1, 3, 1, 3, 3, 1, 1, 3, 1),
+ n_obs_run = c(2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1),
+ total_features = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
)
# === Run NA-based test ===
@@ -23,37 +23,57 @@ imputed_val_p1 <- dt_na[
]
expect_equal(imputed_val_p1, expected_val_p1)
non_imputed_val_p2 <- dt_na[
- PROTEIN == "P2" & FEATURE == "F3" & LABEL == "L" & RUN == "R3",
- newABUNDANCE
+ PROTEIN == "P2" & FEATURE == "F3" & LABEL == "L" & RUN == "R3",
+ newABUNDANCE
]
expect_true(is.na(non_imputed_val_p2))
non_imputed_val_p2_f4 <- dt_na[
- PROTEIN == "P2" & FEATURE == "F4" & LABEL == "L" & RUN == "R2",
- newABUNDANCE
+ PROTEIN == "P2" & FEATURE == "F4" & LABEL == "L" & RUN == "R2",
+ newABUNDANCE
]
expect_true(is.na(non_imputed_val_p2_f4))
dt_zero <- data.table::data.table(
- PROTEIN = c("P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1",
- "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2"),
- FEATURE = c("F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2",
- "F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2"),
- LABEL = c("L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H",
- "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H"),
- RUN = c("R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4",
- "R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4"),
- newABUNDANCE = c(1.5, NA, 2.0, 2.2, 1.4, 1.6, 1.5, 1.9, 2.0, 2.2, 1.4, 1.6,
- 0.0, 4.0, 2.5, 2.7, 3.0, 3.2, 1.7, 4.0, 2.5, 2.7, 3.0, 3.2),
- censored = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
- TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE),
- nonmissing = c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE,
- FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE),
- n_obs = c(3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4,
- 3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4),
- n_obs_run = c(2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2,
- 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2),
- total_features = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
+ PROTEIN = c(
+ "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1", "P1",
+ "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2", "P2"
+ ),
+ FEATURE = c(
+ "F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2",
+ "F1", "F1", "F2", "F2", "F1", "F2", "F1", "F1", "F2", "F2", "F1", "F2"
+ ),
+ LABEL = c(
+ "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H",
+ "L", "L", "L", "L", "H", "H", "L", "L", "L", "L", "H", "H"
+ ),
+ RUN = c(
+ "R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4",
+ "R1", "R2", "R1", "R2", "R1", "R2", "R3", "R4", "R3", "R4", "R3", "R4"
+ ),
+ newABUNDANCE = c(
+ 1.5, NA, 2.0, 2.2, 1.4, 1.6, 1.5, 1.9, 2.0, 2.2, 1.4, 1.6,
+ 0.0, 4.0, 2.5, 2.7, 3.0, 3.2, 1.7, 4.0, 2.5, 2.7, 3.0, 3.2
+ ),
+ censored = c(
+ FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,
+ TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE
+ ),
+ nonmissing = c(
+ TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE,
+ FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, FALSE
+ ),
+ n_obs = c(
+ 3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4,
+ 3, 3, 4, 4, 3, 4, 3, 3, 4, 4, 3, 4
+ ),
+ n_obs_run = c(
+ 2, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2,
+ 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2
+ ),
+ total_features = c(
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ )
)
MSstats:::.setCensoredByThreshold(dt_zero, censored_symbol = "0", remove50missing = FALSE)
@@ -64,4 +84,3 @@ imputed_val_p2 <- dt_zero[
newABUNDANCE
]
expect_equal(imputed_val_p2, expected_val_p2)
-
diff --git a/inst/tinytest/test_utils_groupcomparison_checks.R b/inst/tinytest/test_utils_groupcomparison_checks.R
index c4c54994..9ef3987d 100644
--- a/inst/tinytest/test_utils_groupcomparison_checks.R
+++ b/inst/tinytest/test_utils_groupcomparison_checks.R
@@ -1,32 +1,41 @@
# Test .checkTechReplicate ------------------------------------------------
# Test 1: No technical replicates
-input = data.table::data.table(GROUP = c("A", "A", "B", "B"),
- RUN = c(1, 2, 3, 4),
- SUBJECT = c(1, 2, 3, 4))
+input <- data.table::data.table(
+ GROUP = c("A", "A", "B", "B"),
+ RUN = c(1, 2, 3, 4),
+ SUBJECT = c(1, 2, 3, 4)
+)
expect_false(MSstats:::.checkTechReplicate(input))
# Test 2: Repeated Measures - No technical replicates
-input = data.table::data.table(GROUP = c("A", "A", "B", "B"),
- RUN = c(1, 2, 3, 4),
- SUBJECT = c(1, 2, 1, 2))
+input <- data.table::data.table(
+ GROUP = c("A", "A", "B", "B"),
+ RUN = c(1, 2, 3, 4),
+ SUBJECT = c(1, 2, 1, 2)
+)
expect_false(MSstats:::.checkTechReplicate(input))
# Test 3: Technical replicates for one subject
-input = data.table::data.table(GROUP = c("A", "A", "B", "B"),
- RUN = c(1, 2, 3, 4),
- SUBJECT = c(1, 1, 2, 3))
+input <- data.table::data.table(
+ GROUP = c("A", "A", "B", "B"),
+ RUN = c(1, 2, 3, 4),
+ SUBJECT = c(1, 1, 2, 3)
+)
expect_true(MSstats:::.checkTechReplicate(input))
# Test 4: Technical replicates for all subjects
-input = data.table::data.table(GROUP = c("A", "A", "B", "B"),
- RUN = c(1, 2, 3, 4),
- SUBJECT = c(1, 1, 2, 2))
+input <- data.table::data.table(
+ GROUP = c("A", "A", "B", "B"),
+ RUN = c(1, 2, 3, 4),
+ SUBJECT = c(1, 1, 2, 2)
+)
expect_true(MSstats:::.checkTechReplicate(input))
# Test 5: Repeated Measures - Technical replicates
-input = data.table::data.table(GROUP = c("A", "A", "B", "B", "B"),
- RUN = c(1, 2, 3, 4, 5),
- SUBJECT = c(1, 2, 1, 2, 2))
+input <- data.table::data.table(
+ GROUP = c("A", "A", "B", "B", "B"),
+ RUN = c(1, 2, 3, 4, 5),
+ SUBJECT = c(1, 2, 1, 2, 2)
+)
expect_true(MSstats:::.checkTechReplicate(input))
-
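Judging only from the five cases above, the check returns TRUE whenever some GROUP/SUBJECT combination spans more than one RUN. A hypothetical re-implementation consistent with these cases (not the package's actual code) could look like:

library(data.table)

has_tech_replicate <- function(input) {
  runs_per_subject <- input[, data.table::uniqueN(RUN), by = c("GROUP", "SUBJECT")]
  any(runs_per_subject$V1 > 1)   # more than one run for the same subject within a group
}

input <- data.table(
  GROUP = c("A", "A", "B", "B"),
  RUN = c(1, 2, 3, 4),
  SUBJECT = c(1, 1, 2, 2)
)
has_tech_replicate(input)  # TRUE, matching Test 4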
diff --git a/inst/tinytest/test_utils_statistics.R b/inst/tinytest/test_utils_statistics.R
index 4d07f89e..fab2838b 100644
--- a/inst/tinytest/test_utils_statistics.R
+++ b/inst/tinytest/test_utils_statistics.R
@@ -5,7 +5,7 @@ log_env$messages <- character()
options(MSstatsLog = function(level, msg) {
log_env$messages <- c(log_env$messages, msg)
})
-options(MSstatsMsg = function(level, msg) {}) # no-op for this test
+options(MSstatsMsg = function(level, msg) {}) # no-op for this test
## Simulate a small, realistic MS data input
input <- data.table::data.table(
@@ -58,26 +58,26 @@ log_env$messages <- character()
## Override logging options with closures that update the environment
options(MSstatsLog = function(level, msg) {
- log_env$messages <- c(log_env$messages, msg)
+ log_env$messages <- c(log_env$messages, msg)
})
-options(MSstatsMsg = function(level, msg) {}) # optional no-op
+options(MSstatsMsg = function(level, msg) {}) # optional no-op
## Realistic test dataset
input <- data.table::data.table(
- LABEL = rep(c("L", "H"), each = 12),
- GROUP = rep(c("Control", "Treatment"), each = 6, times = 2),
- FEATURE = rep(c("PEPTIDE1", "PEPTIDE2", "PEPTIDE3"), times = 8),
- RUN = rep(paste0("Run", 1:4), each = 3, times = 2),
- ABUNDANCE = c(
- 10.5, 11.2, NA, # Run1
- 10.8, NA, NA, # Run2
- 11.0, 11.1, 11.2, # Run3
- NA, NA, NA, # Run4
- 9.5, 9.8, 10.0, # Run1
- NA, NA, NA, # Run2
- 9.2, 9.3, NA, # Run3
- NA, NA, NA # Run4
- )
+ LABEL = rep(c("L", "H"), each = 12),
+ GROUP = rep(c("Control", "Treatment"), each = 6, times = 2),
+ FEATURE = rep(c("PEPTIDE1", "PEPTIDE2", "PEPTIDE3"), times = 8),
+ RUN = rep(paste0("Run", 1:4), each = 3, times = 2),
+ ABUNDANCE = c(
+ 10.5, 11.2, NA, # Run1
+ 10.8, NA, NA, # Run2
+ 11.0, 11.1, 11.2, # Run3
+ NA, NA, NA, # Run4
+ 9.5, 9.8, 10.0, # Run1
+ NA, NA, NA, # Run2
+ 9.2, 9.3, NA, # Run3
+ NA, NA, NA # Run4
+ )
)
## Run the function
@@ -88,13 +88,13 @@ msgs <- log_env$messages
## Feature-level logging check
expect_true(
- any(grepl("Some features are completely missing.*PEPTIDE", msgs)),
- info = "Should log about completely missing features"
+ any(grepl("Some features are completely missing.*PEPTIDE", msgs)),
+ info = "Should log about completely missing features"
)
## Run-level logging check
expect_true(
- any(grepl("more than 75% missing values.*Run2", msgs)) &&
- any(grepl("Run4", msgs)),
- info = "Should log that Run2 and Run4 have >75% missing"
-)
\ No newline at end of file
+ any(grepl("more than 75% missing values.*Run2", msgs)) &&
+ any(grepl("Run4", msgs)),
+ info = "Should log that Run2 and Run4 have >75% missing"
+)
diff --git a/man/MSstatsContrastMatrix.Rd b/man/MSstatsContrastMatrix.Rd
index faf30fe3..a29e648c 100644
--- a/man/MSstatsContrastMatrix.Rd
+++ b/man/MSstatsContrastMatrix.Rd
@@ -8,7 +8,7 @@ MSstatsContrastMatrix(contrasts, conditions, labels = NULL)
}
\arguments{
\item{contrasts}{One of the following:
-i) list of lists. Each sub-list consists of two vectors that name
+i) list of lists. Each sub-list consists of two vectors that name
conditions that will be compared. See the details section for more information
 ii) matrix. In this case, its correctness will be checked
iii) "pairwise". In this case, pairwise comparison matrix will be generated
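The commented-out example in test_making_contrasts.R (earlier in this diff) uses the list-of-lists form described here; a hedged usage sketch with invented condition names:

# each sub-list names the two sides of one comparison
contrasts <- list(
  list("Control", "Treatment"),
  list("Control", c("Treatment", "TreatmentHigh"))
)
conditions <- c("Control", "Treatment", "TreatmentHigh")
labels <- c("Control vs Treatment", "Control vs both treatments")

MSstatsContrastMatrix(contrasts, conditions, labels)
MSstatsContrastMatrix("pairwise", conditions)   # all pairwise comparisons instead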
diff --git a/man/MSstatsGroupComparison.Rd b/man/MSstatsGroupComparison.Rd
index 0b48d455..e8048d4a 100644
--- a/man/MSstatsGroupComparison.Rd
+++ b/man/MSstatsGroupComparison.Rd
@@ -24,8 +24,8 @@ MSstatsGroupComparison(
\item{samples_info}{data.table, output of getSamplesInfo function}
-\item{numberOfCores}{Number of cores for parallel processing. When > 1,
-a logfile named `MSstats_groupComparison_log_progress.log` is created to
+\item{numberOfCores}{Number of cores for parallel processing. When > 1,
+a logfile named `MSstats_groupComparison_log_progress.log` is created to
track progress. Only works for Linux & Mac OS.}
}
\description{
@@ -33,16 +33,18 @@ Group comparison
}
\examples{
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
levels(QuantData$ProteinLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
-groups = levels(QuantData$ProteinLevelData$GROUP)
+groups <- levels(QuantData$ProteinLevelData$GROUP)
colnames(comparison) <- groups[order(as.numeric(groups))]
-samples_info = getSamplesInfo(QuantData)
-repeated = checkRepeatedDesign(QuantData)
-group_comparison = MSstatsGroupComparison(group_comparison_input, comparison,
- FALSE, repeated, samples_info)
+samples_info <- getSamplesInfo(QuantData)
+repeated <- checkRepeatedDesign(QuantData)
+group_comparison <- MSstatsGroupComparison(
+ group_comparison_input, comparison,
+ FALSE, repeated, samples_info
+)
length(group_comparison) # list of length equal to number of proteins
group_comparison[[1]][[1]] # data used to fit linear model
group_comparison[[1]][[2]] # comparison result
diff --git a/man/MSstatsGroupComparisonOutput.Rd b/man/MSstatsGroupComparisonOutput.Rd
index 31d2a5af..bdede0d7 100644
--- a/man/MSstatsGroupComparisonOutput.Rd
+++ b/man/MSstatsGroupComparisonOutput.Rd
@@ -21,18 +21,22 @@ Create output of group comparison based on results for individual proteins
}
\examples{
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
levels(QuantData$ProteinLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
-groups = levels(QuantData$ProteinLevelData$GROUP)
+groups <- levels(QuantData$ProteinLevelData$GROUP)
colnames(comparison) <- groups[order(as.numeric(groups))]
-samples_info = getSamplesInfo(QuantData)
-repeated = checkRepeatedDesign(QuantData)
-group_comparison = MSstatsGroupComparison(group_comparison_input, comparison,
- FALSE, repeated, samples_info)
-group_comparison_final = MSstatsGroupComparisonOutput(group_comparison,
- QuantData)
-group_comparison_final[["ComparisonResult"]]
-
+samples_info <- getSamplesInfo(QuantData)
+repeated <- checkRepeatedDesign(QuantData)
+group_comparison <- MSstatsGroupComparison(
+ group_comparison_input, comparison,
+ FALSE, repeated, samples_info
+)
+group_comparison_final <- MSstatsGroupComparisonOutput(
+ group_comparison,
+ QuantData
+)
+group_comparison_final[["ComparisonResult"]]
+
}
diff --git a/man/MSstatsGroupComparisonSingleProtein.Rd b/man/MSstatsGroupComparisonSingleProtein.Rd
index fa279ba1..b8de2d52 100644
--- a/man/MSstatsGroupComparisonSingleProtein.Rd
+++ b/man/MSstatsGroupComparisonSingleProtein.Rd
@@ -37,15 +37,16 @@ Group comparison for a single protein
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
levels(QuantData$ProteinLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
-groups = levels(QuantData$ProteinLevelData$GROUP)
+groups <- levels(QuantData$ProteinLevelData$GROUP)
colnames(comparison) <- groups[order(as.numeric(groups))]
samples_info <- getSamplesInfo(QuantData)
repeated <- checkRepeatedDesign(QuantData)
single_output <- MSstatsGroupComparisonSingleProtein(
group_comparison_input[[1]], comparison, repeated, groups, samples_info,
- FALSE, TRUE)
+ FALSE, TRUE
+)
single_output # same as a single element of MSstatsGroupComparison output
}
diff --git a/man/MSstatsHandleMissing.Rd b/man/MSstatsHandleMissing.Rd
index 8fc72da2..65f9ef79 100644
--- a/man/MSstatsHandleMissing.Rd
+++ b/man/MSstatsHandleMissing.Rd
@@ -17,7 +17,7 @@ MSstatsHandleMissing(
\item{summary_method}{summarization method (`summaryMethod` parameter to `dataProcess`)}
-\item{impute}{if TRUE, missing values are supposed to be imputed
+\item{impute}{if TRUE, missing values will be imputed
(`MBimpute` parameter to `dataProcess`)}
\item{missing_symbol}{`censoredInt` parameter to `dataProcess`}
@@ -31,15 +31,15 @@ data.table
Handle censored missing values
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
head(input)
-
+
}
diff --git a/man/MSstatsMergeFractions.Rd b/man/MSstatsMergeFractions.Rd
index 2ee7e9f5..d5ab051b 100644
--- a/man/MSstatsMergeFractions.Rd
+++ b/man/MSstatsMergeFractions.Rd
@@ -16,14 +16,14 @@ data.table
Re-format the data before feature selection
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
head(input)
}
diff --git a/man/MSstatsNormalize.Rd b/man/MSstatsNormalize.Rd
index 1da3e808..4d481fa9 100644
--- a/man/MSstatsNormalize.Rd
+++ b/man/MSstatsNormalize.Rd
@@ -19,10 +19,10 @@ MSstatsNormalize(
"QUANTILE" normalization for quantile normalization from `preprocessCore` package,
"GLOBALSTANDARDS" for normalization based on selected peptides or proteins.}
-\item{peptides_dict}{`data.table` of names of peptides and their corresponding
+\item{peptides_dict}{`data.table` of names of peptides and their corresponding
features.}
-\item{standards}{character vector with names of standards, required if
+\item{standards}{character vector with names of standards, required if
"GLOBALSTANDARDS" method was selected.}
}
\value{
@@ -32,13 +32,13 @@ data.table
Normalize MS data
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS") # median normalization
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS") # median normalization
head(input)
}
diff --git a/man/MSstatsPrepareForDataProcess.Rd b/man/MSstatsPrepareForDataProcess.Rd
index 364ec889..e6d3bb8f 100644
--- a/man/MSstatsPrepareForDataProcess.Rd
+++ b/man/MSstatsPrepareForDataProcess.Rd
@@ -23,12 +23,12 @@ data.table
Prepare data for processing by `dataProcess` function
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
head(input)
}
diff --git a/man/MSstatsPrepareForGroupComparison.Rd b/man/MSstatsPrepareForGroupComparison.Rd
index 630180f4..aa3b1f5c 100644
--- a/man/MSstatsPrepareForGroupComparison.Rd
+++ b/man/MSstatsPrepareForGroupComparison.Rd
@@ -10,7 +10,7 @@ MSstatsPrepareForGroupComparison(summarization_output)
\item{summarization_output}{output of dataProcess}
}
\value{
-list of run-level data for each protein in the input.
+list of run-level data for each protein in the input.
This list has a "has_imputed" attribute that indicates if missing values
were imputed in the input dataset.
}
@@ -19,7 +19,7 @@ Prepare output for dataProcess for group comparison
}
\examples{
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
-group_comparison_input = MSstatsPrepareForGroupComparison(QuantData)
+group_comparison_input <- MSstatsPrepareForGroupComparison(QuantData)
length(group_comparison_input) # list of length equal to number of proteins
# in protein-level data of QuantData
head(group_comparison_input[[1]])
diff --git a/man/MSstatsPrepareForSummarization.Rd b/man/MSstatsPrepareForSummarization.Rd
index dff9368a..7ca1dddb 100644
--- a/man/MSstatsPrepareForSummarization.Rd
+++ b/man/MSstatsPrepareForSummarization.Rd
@@ -20,11 +20,11 @@ MSstatsPrepareForSummarization(
\item{impute}{if TRUE, censored missing values will be imputed - `MBimpute`
parameter of the dataProcess function}
-\item{censored_symbol}{censored missing value indicator - `censoredInt`
+\item{censored_symbol}{censored missing value indicator - `censoredInt`
parameter of the dataProcess function}
-\item{remove_uninformative_feature_outlier}{if TRUE, features labeled as
-outlier of uninformative by the MSstatsSelectFeatures function will not be
+\item{remove_uninformative_feature_outlier}{if TRUE, features labeled as
+outlier or uninformative by the MSstatsSelectFeatures function will not be
used in summarization}
}
\value{
@@ -34,12 +34,12 @@ data.table
Prepare feature-level data for protein-level summarization
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
head(input)
}
diff --git a/man/MSstatsSelectFeatures.Rd b/man/MSstatsSelectFeatures.Rd
index 155f0579..bc9dcc20 100644
--- a/man/MSstatsSelectFeatures.Rd
+++ b/man/MSstatsSelectFeatures.Rd
@@ -22,18 +22,18 @@ data.table
Feature selection before feature-level data summarization
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input_all = MSstatsSelectFeatures(input, "all") # all features
-input_5 = MSstatsSelectFeatures(data.table::copy(input), "topN", top_n = 5) # top 5 features
-input_informative = MSstatsSelectFeatures(input, "highQuality") # feature selection
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input_all <- MSstatsSelectFeatures(input, "all") # all features
+input_5 <- MSstatsSelectFeatures(data.table::copy(input), "topN", top_n = 5) # top 5 features
+input_informative <- MSstatsSelectFeatures(input, "highQuality") # feature selection
head(input_all)
head(input_5)
diff --git a/man/MSstatsSummarizationOutput.Rd b/man/MSstatsSummarizationOutput.Rd
index 280087c0..103404c5 100644
--- a/man/MSstatsSummarizationOutput.Rd
+++ b/man/MSstatsSummarizationOutput.Rd
@@ -26,13 +26,13 @@ MSstatsSummarizationOutput(
\item{impute}{if TRUE, censored missing values were imputed
(`MBimpute` parameter to `dataProcess`)}
-\item{censored_symbol}{censored missing value indicator
+\item{censored_symbol}{censored missing value indicator
(`censoredInt` parameter to `dataProcess`)}
}
\value{
list that consists of the following elements:
\itemize{
-\item{FeatureLevelData}{ - feature-level data after processing}
+\item{FeatureLevelData}{ - feature-level data after processing}
\item{ProteinLevelData}{ - protein-level (summarized) data}
\item{SummaryMethod}{ (string) - name of summarization method that was used}
}
@@ -41,20 +41,22 @@ list that consists of the following elements:
Post-processing output from MSstats summarization
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input = MSstatsSelectFeatures(input, "all")
-processed = getProcessed(input)
-input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
-output = output = MSstatsSummarizationOutput(input, summarized, processed,
-method, impute, cens)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input <- MSstatsSelectFeatures(input, "all")
+processed <- getProcessed(input)
+input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+summarized <- MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
+output <- MSstatsSummarizationOutput(
+ input, summarized, processed,
+ method, impute, cens
+)
}
diff --git a/man/MSstatsSummarizeSingleLinear.Rd b/man/MSstatsSummarizeSingleLinear.Rd
index 82dde780..700ac2b4 100644
--- a/man/MSstatsSummarizeSingleLinear.Rd
+++ b/man/MSstatsSummarizeSingleLinear.Rd
@@ -18,20 +18,20 @@ list with protein-level data
Linear model-based summarization for a single protein
}
\examples{
-raw = DDARawData
-method = "linear"
-cens = NULL
-impute = FALSE
+raw <- DDARawData
+method <- "linear"
+cens <- NULL
+impute <- FALSE
# currently, MSstats only supports MBimpute = FALSE for linear summarization
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input = MSstatsSelectFeatures(input, "all")
-input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-input_split = split(input, input$PROTEIN)
-single_protein_summary = MSstatsSummarizeSingleLinear(input_split[[1]])
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input <- MSstatsSelectFeatures(input, "all")
+input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+input_split <- split(input, input$PROTEIN)
+single_protein_summary <- MSstatsSummarizeSingleLinear(input_split[[1]])
head(single_protein_summary[[1]])
}
diff --git a/man/MSstatsSummarizeSingleTMP.Rd b/man/MSstatsSummarizeSingleTMP.Rd
index 9d656887..b592f4b8 100644
--- a/man/MSstatsSummarizeSingleTMP.Rd
+++ b/man/MSstatsSummarizeSingleTMP.Rd
@@ -14,18 +14,18 @@ MSstatsSummarizeSingleTMP(
\arguments{
\item{single_protein}{feature-level data for a single protein}
-\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
+\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelerated failure model.
FALSE uses the values assigned by cutoffCensored}
-\item{censored_symbol}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censored_symbol}{Indicates whether missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
 NULL assumes that all NA intensities are randomly missing.}
-\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
+\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
where every run has at least 50\% missing values for each peptide. FALSE is default.}
}
\value{
@@ -36,21 +36,23 @@ the other with protein-level data
Tukey Median Polish summarization for a single protein
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
# currently, MSstats only supports MBimpute = FALSE for linear summarization
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input = MSstatsSelectFeatures(input, "all")
-input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-input_split = split(input, input$PROTEIN)
-single_protein_summary = MSstatsSummarizeSingleTMP(input_split[[1]],
- impute, cens, FALSE)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input <- MSstatsSelectFeatures(input, "all")
+input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+input_split <- split(input, input$PROTEIN)
+single_protein_summary <- MSstatsSummarizeSingleTMP(
+ input_split[[1]],
+ impute, cens, FALSE
+)
head(single_protein_summary[[1]])
}
diff --git a/man/MSstatsSummarizeWithMultipleCores.Rd b/man/MSstatsSummarizeWithMultipleCores.Rd
index 7387110e..04363fa7 100644
--- a/man/MSstatsSummarizeWithMultipleCores.Rd
+++ b/man/MSstatsSummarizeWithMultipleCores.Rd
@@ -19,29 +19,29 @@ MSstatsSummarizeWithMultipleCores(
\item{method}{summarization method: "linear" or "TMP"}
-\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
+\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelerated failure model.
FALSE uses the values assigned by cutoffCensored}
-\item{censored_symbol}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censored_symbol}{Indicates whether missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
 NULL assumes that all NA intensities are randomly missing.}
-\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
+\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
where every run has at least 50\% missing values for each peptide. FALSE is default.}
-\item{equal_variance}{only for summaryMethod = "linear". Default is TRUE.
-Logical variable for whether the model should account for heterogeneous variation
+\item{equal_variance}{only for summaryMethod = "linear". Default is TRUE.
+Logical variable for whether the model should account for heterogeneous variation
 among intensities from different features. Default is TRUE, which assumes equal
-variance among intensities from features. FALSE means that we cannot assume
+variance among intensities from features. FALSE means that we cannot assume
equal variance among intensities from features, then we will account for
heterogeneous variation from different features.}
-\item{numberOfCores}{Number of cores for parallel processing. When > 1,
-a logfile named `MSstats_dataProcess_log_progress.log` is created to
+\item{numberOfCores}{Number of cores for parallel processing. When > 1,
+a logfile named `MSstats_dataProcess_log_progress.log` is created to
track progress. Only works for Linux & Mac OS. Default is 1.}
}
\value{
diff --git a/man/MSstatsSummarizeWithSingleCore.Rd b/man/MSstatsSummarizeWithSingleCore.Rd
index 84b64a8c..d18530a5 100644
--- a/man/MSstatsSummarizeWithSingleCore.Rd
+++ b/man/MSstatsSummarizeWithSingleCore.Rd
@@ -18,24 +18,24 @@ MSstatsSummarizeWithSingleCore(
\item{method}{summarization method: "linear" or "TMP"}
-\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelated failure model.
+\item{impute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+TRUE (default) imputes 'NA' or '0' (depending on censoredInt option) by Accelerated failure model.
FALSE uses the values assigned by cutoffCensored}
-\item{censored_symbol}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censored_symbol}{Missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
Null assumes that all NA intensites are randomly missing.}
-\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
+\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
where every run has at least 50\% missing values for each peptide. FALSE is default.}
-\item{equal_variance}{only for summaryMethod = "linear". Default is TRUE.
-Logical variable for whether the model should account for heterogeneous variation
+\item{equal_variance}{only for summaryMethod = "linear". Default is TRUE.
+Logical variable for whether the model should account for heterogeneous variation
among intensities from different features. Default is TRUE, which assume equal
-variance among intensities from features. FALSE means that we cannot assume
+variance among intensities from features. FALSE means that we cannot assume
equal variance among intensities from features, then we will account for
heterogeneous variation from different features.}
}
@@ -46,19 +46,19 @@ list of length one with run-level data.
Feature-level data summarization with 1 core
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input = MSstatsSelectFeatures(input, "all")
-processed = getProcessed(input)
-input = MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
-summarized = MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input <- MSstatsSelectFeatures(input, "all")
+processed <- getProcessed(input)
+input <- MSstatsPrepareForSummarization(input, method, impute, cens, FALSE)
+summarized <- MSstatsSummarizeWithSingleCore(input, method, impute, cens, FALSE, TRUE)
length(summarized) # list of summarization outputs for each protein
head(summarized[[1]][[1]]) # run-level summary
diff --git a/man/SDRFtoAnnotation.Rd b/man/SDRFtoAnnotation.Rd
index b45f34bb..ab5d407a 100644
--- a/man/SDRFtoAnnotation.Rd
+++ b/man/SDRFtoAnnotation.Rd
@@ -15,35 +15,35 @@ SDRFtoAnnotation(
\arguments{
\item{data}{SDRF annotation file}
-\item{run_name}{Column name in SDRF file which contains the name of the MS
+\item{run_name}{Column name in SDRF file which contains the name of the MS
run. The information in this column must match exactly with the run names in
the PSM file}
-\item{condition_name}{Column name in SDRF file which contains information on
+\item{condition_name}{Column name in SDRF file which contains information on
the conditions in the data.}
-\item{biological_replicate}{Column name in SDRF file which contains the
-identifier for the biological replicte. Note MSstats uses this column to
-determine if the experiment is a repeated measure design. BioReplicte IDs
+\item{biological_replicate}{Column name in SDRF file which contains the
+identifier for the biological replicate. Note MSstats uses this column to
+determine if the experiment is a repeated measure design. BioReplicate IDs
should only be reused if the replicate was measured multiple times.}
-\item{fraction}{Column name in SDFT file which contains information on the
-fractionation in the data. Only required if data contains fractions. Default
+\item{fraction}{Column name in SDRF file which contains information on the
+fractionation in the data. Only required if data contains fractions. Default
is `NULL`}
}
\description{
Takes an SDRF file and outputs an MSstats annotation file. Note
-the information in the SDRF file must be correctly annotated for MSstats so
-that MSstats can identify the experimental design. In particular the
-biological replicates must be correctly annotated, with group comparison
-experiments having a unique ID for each BioReplicate. For more information
-on this please see the Supplementary of the most recent
+the information in the SDRF file must be correctly annotated for MSstats so
+that MSstats can identify the experimental design. In particular the
+biological replicates must be correctly annotated, with group comparison
+experiments having a unique ID for each BioReplicate. For more information
+on this please see the Supplementary of the most recent
\href{https://pubs.acs.org/doi/10.1021/acs.jproteome.2c00834}{MSstats paper}
}
\examples{
head(example_SDRF)
-msstats_annotation = SDRFtoAnnotation(example_SDRF)
+msstats_annotation <- SDRFtoAnnotation(example_SDRF)
head(msstats_annotation)
}
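A hedged sketch of the column-name arguments documented above: the SDRF column names used here are placeholders assumed for illustration only; replace them with the column names present in your own SDRF file.

msstats_annotation2 <- SDRFtoAnnotation(
  example_SDRF,
  run_name = "comment[data file]",                                 # assumed column name
  condition_name = "characteristics[disease]",                     # assumed column name
  biological_replicate = "characteristics[biological replicate]",  # assumed column name
  fraction = NULL                                                  # no fractions in this sketch
)
head(msstats_annotation2)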
diff --git a/man/dataProcess.Rd b/man/dataProcess.Rd
index bb934741..1f4a38ff 100644
--- a/man/dataProcess.Rd
+++ b/man/dataProcess.Rd
@@ -32,30 +32,30 @@ dataProcess(
\item{logTrans}{base of logarithm transformation: 2 (default) or 10.}
-\item{normalization}{normalization to remove systematic bias between MS runs.
+\item{normalization}{normalization to remove systematic bias between MS runs.
There are three different normalizations supported:
-'equalizeMedians' (default) represents constant normalization (equalizing the medians)
-based on reference signals is performed.
-'quantile' represents quantile normalization based on reference signals
-'globalStandards' represents normalization with global standards proteins.
-If FALSE, no normalization is performed. See MSstats vignettes for
+'equalizeMedians' (default) represents constant normalization (equalizing the medians)
+based on reference signals.
+'quantile' represents quantile normalization based on reference signals.
+'globalStandards' represents normalization with global standard proteins.
+If FALSE, no normalization is performed. See MSstats vignettes for
recommendations on which normalization option to use.}
-\item{nameStandards}{optional vector of global standard peptide names.
+\item{nameStandards}{optional vector of global standard peptide names.
Required only for normalization with global standard peptides.}
-\item{featureSubset}{"all" (default) uses all features that the data set has.
-"top3" uses top 3 features which have highest average of log-intensity across runs.
-"topN" uses top N features which has highest average of log-intensity across runs.
-It needs the input for n_top_feature option.
-"highQuality" flags uninformative feature and outliers. See MSstats vignettes for
+\item{featureSubset}{"all" (default) uses all features that the data set has.
+"top3" uses top 3 features which have highest average of log-intensity across runs.
+"topN" uses top N features which has highest average of log-intensity across runs.
+It needs the input for n_top_feature option.
+"highQuality" flags uninformative feature and outliers. See MSstats vignettes for
recommendations on which feature selection option to use.}
-\item{remove_uninformative_feature_outlier}{optional. Only required if
-featureSubset = "highQuality". TRUE allows to remove
+\item{remove_uninformative_feature_outlier}{optional. Only required if
+featureSubset = "highQuality". TRUE allows to remove
1) noisy features (flagged in the column feature_quality with "Uninformative"),
-2) outliers (flagged in the column, is_outlier with TRUE,
-before run-level summarization. FALSE (default) uses all features and intensities
+2) outliers (flagged in the column, is_outlier with TRUE)
+before run-level summarization. FALSE (default) uses all features and intensities
for run-level summarization.}
\item{min_feature_count}{optional. Only required if featureSubset = "highQuality".
@@ -66,30 +66,30 @@ in the feature selection algorithm.}
It that case, it specifies number of top features that will be used.
Default is 3, which means to use top 3 features.}
-\item{summaryMethod}{"TMP" (default) means Tukey's median polish,
+\item{summaryMethod}{"TMP" (default) means Tukey's median polish,
which is robust estimation method. "linear" uses linear mixed model.}
-\item{equalFeatureVar}{only for summaryMethod = "linear". default is TRUE.
-Logical variable for whether the model should account for heterogeneous variation
-among intensities from different features. Default is TRUE, which assume equal
-variance among intensities from features. FALSE means that we cannot assume equal
-variance among intensities from features, then we will account for heterogeneous
+\item{equalFeatureVar}{only for summaryMethod = "linear". Default is TRUE.
+Logical variable for whether the model should account for heterogeneous variation
+among intensities from different features. Default is TRUE, which assumes equal
+variance among intensities from features. FALSE means that we cannot assume equal
+variance among intensities from features, then we will account for heterogeneous
variation from different features.}
-\item{censoredInt}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censoredInt}{Missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
Null assumes that all NA intensites are randomly missing.}
-\item{MBimpute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
-TRUE (default) imputes missing values with 'NA' or '0' (depending on censoredInt option)
-by Accelerated failure model. If set to FALSE, no missing values are imputed.
+\item{MBimpute}{only for summaryMethod = "TMP" and censoredInt = 'NA' or '0'.
+TRUE (default) imputes missing values with 'NA' or '0' (depending on censoredInt option)
+by Accelerated failure model. If set to FALSE, no missing values are imputed.
FALSE is appropriate only when missingness is assumed to be at random.
See MSstats vignettes for recommendations on which imputation option to use.}
-\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
+\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
where every run has at least 50\% missing values for each peptide. FALSE is default.}
\item{fix_missing}{Optional, same as the `fix_missing` parameter in MSstatsConvert::MSstatsBalancedDesign function}
@@ -105,13 +105,13 @@ to an existing log file.}
\item{verbose}{logical. If TRUE, information about data processing wil be printed
to the console.}
-\item{log_file_path}{character. Path to a file to which information about
-data processing will be saved.
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}
-\item{numberOfCores}{Number of cores for parallel processing. When > 1,
-a logfile named `MSstats_dataProcess_log_progress.log` is created to
+\item{numberOfCores}{Number of cores for parallel processing. When > 1,
+a logfile named `MSstats_dataProcess_log_progress.log` is created to
track progress. Only works for Linux & Mac OS. Default is 1.}
}
\value{
@@ -163,13 +163,13 @@ Process MS data: clean, normalize and summarize before differential analysis
# across time points.
head(SRMRawData)
# Log2 transformation and normalization are applied (default)
-QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
# Log10 transformation and normalization are applied
-QuantData1<-dataProcess(SRMRawData, logTrans=10, use_log_file = FALSE)
+QuantData1 <- dataProcess(SRMRawData, logTrans = 10, use_log_file = FALSE)
head(QuantData1$FeatureLevelData)
# Log2 transformation and no normalization are applied
-QuantData2<-dataProcess(SRMRawData,normalization=FALSE, use_log_file = FALSE)
+QuantData2 <- dataProcess(SRMRawData, normalization = FALSE, use_log_file = FALSE)
head(QuantData2$FeatureLevelData)
}
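A minimal sketch that spells out the summarization options documented above; the values shown are the documented defaults, written explicitly for illustration.

QuantData3 <- dataProcess(
  SRMRawData,
  normalization = "equalizeMedians",  # constant normalization (default)
  featureSubset = "all",              # use all features (default)
  summaryMethod = "TMP",              # Tukey's median polish (default)
  censoredInt = "NA",                 # treat NA intensities as censored (default)
  MBimpute = TRUE,                    # impute censored values with the accelerated failure model (default)
  use_log_file = FALSE
)
head(QuantData3$ProteinLevelData)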
diff --git a/man/dataProcessPlots.Rd b/man/dataProcessPlots.Rd
index 2d1f08ca..9f7f4496 100644
--- a/man/dataProcessPlots.Rd
+++ b/man/dataProcessPlots.Rd
@@ -33,43 +33,43 @@ dataProcessPlots(
\arguments{
\item{data}{name of the (output of dataProcess function) data set.}
-\item{type}{choice of visualization. "ProfilePlot" represents profile plot of
-log intensities across MS runs. "QCPlot" represents quality control plot of
-log intensities across MS runs. "ConditionPlot" represents mean plot of log
+\item{type}{choice of visualization. "ProfilePlot" represents profile plot of
+log intensities across MS runs. "QCPlot" represents quality control plot of
+log intensities across MS runs. "ConditionPlot" represents mean plot of log
ratios (Light/Heavy) across conditions.}
-\item{featureName}{for "ProfilePlot" only, "Transition" (default) means
-printing feature legend in transition-level; "Peptide" means printing feature
+\item{featureName}{for "ProfilePlot" only, "Transition" (default) means
+printing feature legend in transition-level; "Peptide" means printing feature
legend in peptide-level; "NA" means no feature legend printing.}
-\item{ylimUp}{upper limit for y-axis in the log scale. FALSE(Default) for
-Profile Plot and QC Plot use the upper limit as rounded off maximum of
+\item{ylimUp}{upper limit for y-axis in the log scale. FALSE(Default) for
+Profile Plot and QC Plot uses the upper limit as the rounded-off maximum of
log2(intensities) after normalization + 3. FALSE(Default) for Condition Plot
is maximum of log ratio + SD or CI.}
-\item{ylimDown}{lower limit for y-axis in the log scale. FALSE(Default) for
+\item{ylimDown}{lower limit for y-axis in the log scale. FALSE(Default) for
Profile Plot and QC Plot is 0. FALSE(Default) for Condition Plot is minumum
of log ratio - SD or CI.}
-\item{scale}{for "ConditionPlot" only, FALSE(default) means each conditional
-level is not scaled at x-axis according to its actual value (equal space at
+\item{scale}{for "ConditionPlot" only, FALSE(default) means each conditional
+level is not scaled at x-axis according to its actual value (equal space at
x-axis). TRUE means each conditional level is scaled at x-axis according to
its actual value (unequal space at x-axis).}
-\item{interval}{for "ConditionPlot" only, "CI"(default) uses confidence
-interval with 0.95 significant level for the width of error bar.
+\item{interval}{for "ConditionPlot" only, "CI"(default) uses confidence
+interval with 0.95 significance level for the width of error bar.
"SD" uses standard deviation for the width of error bar.}
-\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
+\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
QC Plot, and "Condition" in Condition Plot. Default is 10.}
\item{y.axis.size}{size of y-axis labels. Default is 10.}
-\item{text.size}{size of labels represented each condition at the top of
+\item{text.size}{size of labels representing each condition at the top of
graph in Profile Plot and QC plot. Default is 4.}
\item{text.angle}{angle of labels represented each condition at the top
-of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
Default is 0.}
\item{legend.size}{size of feature legend (transition-level or peptide-level)
@@ -85,47 +85,47 @@ above graph in Profile Plot. Default is 7.}
\item{which.Protein}{Protein list to draw plots. List can be names of Proteins
or order numbers of Proteins from levels(data$FeatureLevelData$PROTEIN).
-Default is "all", which generates all plots for each protein.
+Default is "all", which generates all plots for each protein.
For QC plot, "allonly" will generate one QC plot with all proteins.}
\item{originalPlot}{TRUE(default) draws original profile plots.}
-\item{summaryPlot}{TRUE(default) draws profile plots with
+\item{summaryPlot}{TRUE(default) draws profile plots with
summarization for run levels.}
-\item{save_condition_plot_result}{TRUE saves the table with values
+\item{save_condition_plot_result}{TRUE saves the table with values
using condition plots. Default is FALSE.}
-\item{remove_uninformative_feature_outlier}{It only works after users used
+\item{remove_uninformative_feature_outlier}{It only works after users used
featureSubset="highQuality" in dataProcess. TRUE allows to remove
-1) the features are flagged in the column, feature_quality="Uninformative"
-which are features with bad quality,
-2) outliers that are flagged in the column, is_outlier=TRUE in Profile plots.
+1) the features that are flagged in the column, feature_quality="Uninformative"
+which are features with bad quality,
+2) outliers that are flagged in the column, is_outlier=TRUE in Profile plots.
FALSE (default) shows all features and intensities in profile plots.}
\item{address}{prefix for the filename that will store the results.}
-\item{isPlotly}{Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
+\item{isPlotly}{Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
will save Plotly plots as HTML files. If set to FALSE MSstats will save ggplot2 plots
as PDF files
-Default folder is the current working directory.
+Default folder is the current working directory.
The other assigned folder has to be existed under the current working directory.
- An output pdf file is automatically created with the default name of
- "ProfilePlot.pdf" or "QCplot.pdf" or "ConditionPlot.pdf" or "ConditionPlot_value.csv".
- The command address can help to specify where to store the file as well as
- how to modify the beginning of the file name.
+ An output pdf file is automatically created with the default name of
+ "ProfilePlot.pdf" or "QCplot.pdf" or "ConditionPlot.pdf" or "ConditionPlot_value.csv".
+ The command address can help to specify where to store the file as well as
+ how to modify the beginning of the file name.
If address=FALSE, plot will be not saved as pdf file but showed in window.}
}
\description{
-To illustrate the quantitative data after data-preprocessing and
-quality control of MS runs, dataProcessPlots takes the quantitative data from
-function (\code{\link{dataProcess}}) as input and automatically generate
-three types of figures in pdf files as output :
-(1) profile plot (specify "ProfilePlot" in option type),
-to identify the potential sources of variation for each protein;
-(2) quality control plot (specify "QCPlot" in option type),
-to evaluate the systematic bias between MS runs;
-(3) mean plot for conditions (specify "ConditionPlot" in option type),
+To illustrate the quantitative data after data-preprocessing and
+quality control of MS runs, dataProcessPlots takes the quantitative data from
+function (\code{\link{dataProcess}}) as input and automatically generates
+three types of figures in pdf files as output:
+(1) profile plot (specify "ProfilePlot" in option type),
+to identify the potential sources of variation for each protein;
+(2) quality control plot (specify "QCPlot" in option type),
+to evaluate the systematic bias between MS runs;
+(3) mean plot for conditions (specify "ConditionPlot" in option type),
to illustrate mean and variability of each condition per protein.
}
\details{
@@ -137,20 +137,20 @@ to illustrate mean and variability of each condition per protein.
The input of this function is the quantitative data from function \code{\link{dataProcess}}.
}
\examples{
-# Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interests,
-# three biological replicates, and no technical replicates which is a time-course experiment.
-# The goal is to provide pre-analysis visualization by automatically generate two types of figures
-# in two separate pdf files.
-# Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
+# Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interest,
+# three biological replicates, and no technical replicates, which is a time-course experiment.
+# The goal is to provide pre-analysis visualization by automatically generating two types of figures
+# in two separate pdf files.
+# Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
# whereas, Protein PMG2 (gene name GPM2) is not.
-QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
# Profile plot
-dataProcessPlots(data=QuantData,type="ProfilePlot")
-# Quality control plot
-dataProcessPlots(data=QuantData,type="QCPlot")
+dataProcessPlots(data = QuantData, type = "ProfilePlot")
+# Quality control plot
+dataProcessPlots(data = QuantData, type = "QCPlot")
# Quantification plot for conditions
-dataProcessPlots(data=QuantData,type="ConditionPlot")
+dataProcessPlots(data = QuantData, type = "ConditionPlot")
}
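A short illustrative sketch of the saving options documented above: which.Protein = 1 selects the first protein by order number, address supplies a file-name prefix, and address = FALSE shows the plot in a window instead of writing a pdf file.

# Profile plot for the first protein only, saved with an "Ex3_" prefix
dataProcessPlots(data = QuantData, type = "ProfilePlot",
                 which.Protein = 1, address = "Ex3_")
# One QC plot over all proteins, shown in the window instead of a pdf
dataProcessPlots(data = QuantData, type = "QCPlot",
                 which.Protein = "allonly", address = FALSE)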
diff --git a/man/designSampleSize.Rd b/man/designSampleSize.Rd
index 58e2230d..121639af 100644
--- a/man/designSampleSize.Rd
+++ b/man/designSampleSize.Rd
@@ -19,18 +19,18 @@ designSampleSize(
\arguments{
\item{data}{'FittedModel' in testing output from function groupComparison.}
-\item{desiredFC}{the range of a desired fold change which includes the lower
+\item{desiredFC}{the range of a desired fold change which includes the lower
and upper values of the desired fold change.}
-\item{FDR}{a pre-specified false discovery ratio (FDR) to control the overall
+\item{FDR}{a pre-specified false discovery rate (FDR) to control the overall
false positive rate. Default is 0.05}
-\item{numSample}{minimal number of biological replicates per condition.
-TRUE represents you require to calculate the sample size for this category,
+\item{numSample}{minimal number of biological replicates per condition.
+TRUE indicates that you want to calculate the sample size for this category,
else you should input the exact number of biological replicates.}
-\item{power}{a pre-specified statistical power which defined as the probability
-of detecting a true fold change. TRUE represent you require to calculate the power
+\item{power}{a pre-specified statistical power which is defined as the probability
+of detecting a true fold change. TRUE indicates that you want to calculate the power
for this category, else you should input the average of power you expect. Default is 0.9}
\item{use_log_file}{logical. If TRUE, information about data processing
@@ -42,8 +42,8 @@ to an existing log file.}
\item{verbose}{logical. If TRUE, information about data processing wil be printed
to the console.}
-\item{log_file_path}{character. Path to a file to which information about
-data processing will be saved.
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}
}
@@ -52,17 +52,17 @@ data.frame - sample size calculation results including varibles:
desiredFC, numSample, FDR, and power.
}
\description{
-Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM),
-Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment
-based on intensity-based linear model. Two options of the calculation:
-(1) number of biological replicates per condition,
+Calculate sample size for future experiments of a Selected Reaction Monitoring (SRM),
+Data-Dependent Acquisition (DDA or shotgun), and Data-Independent Acquisition (DIA or SWATH-MS) experiment
+based on intensity-based linear model. Two options of the calculation:
+(1) number of biological replicates per condition,
(2) power.
}
\details{
-The function fits the model and uses variance components to calculate
-sample size. The underlying model fitting with intensity-based linear model with
+The function fits the model and uses variance components to calculate
+sample size. The underlying model is an intensity-based linear model with
technical MS run replication. Estimated sample size is rounded to 0 decimal.
-The function can only obtain either one of the categories of the sample size
+The function can only obtain one of the categories of the sample size
calculation (numSample, numPep, numTran, power) at the same time.
}
\examples{
@@ -71,23 +71,27 @@ calculation (numSample, numPep, numTran, power) at the same time.
QuantData <- dataProcess(SRMRawData)
head(QuantData$FeatureLevelData)
## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-comparison<-rbind(comparison1,comparison2, comparison3)
-row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
+comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+comparison <- rbind(comparison1, comparison2, comparison3)
+row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
-testResultMultiComparisons<-groupComparison(contrast.matrix=comparison,data=QuantData)
+testResultMultiComparisons <- groupComparison(contrast.matrix = comparison, data = QuantData)
## Calculate sample size for future experiments:
-#(1) Minimal number of biological replicates per condition
-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
- desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
-#(2) Power calculation
-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
- desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
-
+# (1) Minimal number of biological replicates per condition
+designSampleSize(
+ data = testResultMultiComparisons$FittedModel, numSample = TRUE,
+ desiredFC = c(1.25, 1.75), FDR = 0.05, power = 0.8
+)
+# (2) Power calculation
+designSampleSize(
+ data = testResultMultiComparisons$FittedModel, numSample = 2,
+ desiredFC = c(1.25, 1.75), FDR = 0.05, power = TRUE
+)
+
}
\author{
Meena Choi, Ching-Yun Chang, Olga Vitek.
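As a short follow-up sketch built on the example above: the returned data.frame is documented to contain desiredFC, numSample, FDR and power, so it can be filtered directly; the cutoff of three replicates below is arbitrary.

sample_size_table <- designSampleSize(
  data = testResultMultiComparisons$FittedModel, numSample = TRUE,
  desiredFC = c(1.25, 1.75), FDR = 0.05, power = 0.8
)
head(sample_size_table)
# desired fold changes detectable with at most 3 biological replicates per condition
subset(sample_size_table, numSample <= 3)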
diff --git a/man/designSampleSizePlots.Rd b/man/designSampleSizePlots.Rd
index 743b6717..ad703427 100644
--- a/man/designSampleSizePlots.Rd
+++ b/man/designSampleSizePlots.Rd
@@ -9,7 +9,7 @@ designSampleSizePlots(data, isPlotly = FALSE)
\arguments{
\item{data}{output from function designSampleSize.}
-\item{isPlotly}{Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
+\item{isPlotly}{Parameter to use Plotly or ggplot2. If set to TRUE, MSstats
will save Plotly plots as HTML files. If set to FALSE MSstats will save ggplot2 plots
as PDF files}
}
@@ -17,10 +17,10 @@ as PDF files}
Plot for estimated sample size with assigned variable.
}
\description{
-To illustrate the relationship of desired fold change and the calculated
-minimal number sample size which are (1) number of biological replicates per condition,
-(2) number of peptides per protein,
-(3) number of transitions per peptide, and
+To illustrate the relationship of desired fold change and the calculated
+minimal sample size, which can be (1) number of biological replicates per condition,
+(2) number of peptides per protein,
+(3) number of transitions per peptide, and
(4) power. The input is the result from function (\code{\link{designSampleSize}}.
}
\details{
@@ -28,29 +28,33 @@ Data in the example is based on the results of sample size calculation from func
}
\examples{
# Based on the results of sample size calculation from function designSampleSize,
-# we generate a series of sample size plots for number of biological replicates, or peptides,
+# we generate a series of sample size plots for number of biological replicates, or peptides,
# or transitions or power plot.
-QuantData<-dataProcess(SRMRawData)
+QuantData <- dataProcess(SRMRawData)
head(QuantData$ProcessedData)
## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-comparison<-rbind(comparison1,comparison2, comparison3)
-row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-colnames(comparison)<-unique(QuantData$ProteinLevelData$GROUP)
+comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+comparison <- rbind(comparison1, comparison2, comparison3)
+row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
-testResultMultiComparisons<-groupComparison(contrast.matrix=comparison, data=QuantData)
+testResultMultiComparisons <- groupComparison(contrast.matrix = comparison, data = QuantData)
# plot the calculated sample sizes for future experiments:
# (1) Minimal number of biological replicates per condition
-result.sample<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=TRUE,
- desiredFC=c(1.25,1.75), FDR=0.05, power=0.8)
-designSampleSizePlots(data=result.sample)
+result.sample <- designSampleSize(
+ data = testResultMultiComparisons$FittedModel, numSample = TRUE,
+ desiredFC = c(1.25, 1.75), FDR = 0.05, power = 0.8
+)
+designSampleSizePlots(data = result.sample)
# (2) Power
-result.power<-designSampleSize(data=testResultMultiComparisons$FittedModel, numSample=2,
- desiredFC=c(1.25,1.75), FDR=0.05, power=TRUE)
-designSampleSizePlots(data=result.power)
+result.power <- designSampleSize(
+ data = testResultMultiComparisons$FittedModel, numSample = 2,
+ desiredFC = c(1.25, 1.75), FDR = 0.05, power = TRUE
+)
+designSampleSizePlots(data = result.power)
}
\author{
diff --git a/man/dot-calculatePower.Rd b/man/dot-calculatePower.Rd
index 61bf5b9f..29771277 100644
--- a/man/dot-calculatePower.Rd
+++ b/man/dot-calculatePower.Rd
@@ -14,10 +14,10 @@
)
}
\arguments{
-\item{desiredFC}{the range of a desired fold change which includes the lower
+\item{desiredFC}{the range of a desired fold change which includes the lower
and upper values of the desired fold change.}
-\item{FDR}{a pre-specified false discovery ratio (FDR) to control the overall
+\item{FDR}{a pre-specified false discovery rate (FDR) to control the overall
false positive rate. Default is 0.05}
\item{delta}{difference between means (?)}
@@ -26,8 +26,8 @@ false positive rate. Default is 0.05}
\item{median_sigma_subject}{median standard deviation per subject}
-\item{numSample}{minimal number of biological replicates per condition.
-TRUE represents you require to calculate the sample size for this category,
+\item{numSample}{minimal number of biological replicates per condition.
+TRUE indicates that you want to calculate the sample size for this category,
else you should input the exact number of biological replicates.}
}
\description{
diff --git a/man/dot-documentFunction.Rd b/man/dot-documentFunction.Rd
index b187044e..7bed2a74 100644
--- a/man/dot-documentFunction.Rd
+++ b/man/dot-documentFunction.Rd
@@ -9,7 +9,7 @@
\arguments{
\item{removeFewMeasurements}{TRUE (default) will remove the features that have 1 or 2 measurements across runs.}
-\item{useUniquePeptide}{TRUE (default) removes peptides that are assigned for more than one proteins.
+\item{useUniquePeptide}{TRUE (default) removes peptides that are assigned to more than one protein.
We assume to use unique peptide for each protein.}
\item{summaryforMultipleRows}{max(default) or sum - when there are multiple measurements for certain feature and certain run, use highest or sum of multiple intensities.}
@@ -31,8 +31,8 @@ to an existing log file.}
\item{verbose}{logical. If TRUE, information about data processing wil be printed
to the console.}
-\item{log_file_path}{character. Path to a file to which information about
-data processing will be saved.
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}
}
diff --git a/man/dot-getNonMissingFilterStats.Rd b/man/dot-getNonMissingFilterStats.Rd
index e2c59980..8fc86f9d 100644
--- a/man/dot-getNonMissingFilterStats.Rd
+++ b/man/dot-getNonMissingFilterStats.Rd
@@ -9,11 +9,11 @@
\arguments{
\item{input}{data.table with data for a single protein}
-\item{censored_symbol}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censored_symbol}{Missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
Null assumes that all NA intensites are randomly missing.}
}
\value{
diff --git a/man/dot-getNumSample.Rd b/man/dot-getNumSample.Rd
index f294fbdb..b4a9d089 100644
--- a/man/dot-getNumSample.Rd
+++ b/man/dot-getNumSample.Rd
@@ -14,11 +14,11 @@
)
}
\arguments{
-\item{desiredFC}{the range of a desired fold change which includes the lower
+\item{desiredFC}{the range of a desired fold change which includes the lower
and upper values of the desired fold change.}
-\item{power}{a pre-specified statistical power which defined as the probability
-of detecting a true fold change. TRUE represent you require to calculate the power
+\item{power}{a pre-specified statistical power which is defined as the probability
+of detecting a true fold change. TRUE indicates that you want to calculate the power
for this category, else you should input the average of power you expect. Default is 0.9}
\item{alpha}{significance level}
diff --git a/man/dot-groupComparisonWithMultipleCores.Rd b/man/dot-groupComparisonWithMultipleCores.Rd
index f4ae9ea1..3a3783e7 100644
--- a/man/dot-groupComparisonWithMultipleCores.Rd
+++ b/man/dot-groupComparisonWithMultipleCores.Rd
@@ -24,8 +24,8 @@
\item{samples_info}{data.table, output of getSamplesInfo function}
-\item{numberOfCores}{Number of cores for parallel processing.
-A logfile named `MSstats_groupComparison_log_progress.log` is created to
+\item{numberOfCores}{Number of cores for parallel processing.
+A logfile named `MSstats_groupComparison_log_progress.log` is created to
track progress. Only works for Linux & Mac OS.}
}
\description{
diff --git a/man/dot-makeConditionPlot.Rd b/man/dot-makeConditionPlot.Rd
index dbc11a37..9be1dc0c 100644
--- a/man/dot-makeConditionPlot.Rd
+++ b/man/dot-makeConditionPlot.Rd
@@ -22,23 +22,23 @@
\arguments{
\item{input}{data.table}
-\item{scale}{for "ConditionPlot" only, FALSE(default) means each conditional
-level is not scaled at x-axis according to its actual value (equal space at
+\item{scale}{for "ConditionPlot" only, FALSE(default) means each conditional
+level is not scaled at x-axis according to its actual value (equal space at
x-axis). TRUE means each conditional level is scaled at x-axis according to
its actual value (unequal space at x-axis).}
\item{single_protein}{data.table}
-\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
+\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
QC Plot, and "Condition" in Condition Plot. Default is 10.}
\item{y.axis.size}{size of y-axis labels. Default is 10.}
-\item{text.size}{size of labels represented each condition at the top of
+\item{text.size}{size of labels representing each condition at the top of
graph in Profile Plot and QC plot. Default is 4.}
\item{text.angle}{angle of labels represented each condition at the top
-of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
Default is 0.}
\item{legend.size}{size of feature legend (transition-level or peptide-level)
diff --git a/man/dot-makeProfilePlot.Rd b/man/dot-makeProfilePlot.Rd
index c259fa28..ca154c0c 100644
--- a/man/dot-makeProfilePlot.Rd
+++ b/man/dot-makeProfilePlot.Rd
@@ -30,20 +30,20 @@
\item{is_censored}{TRUE if censored values were imputed}
-\item{featureName}{for "ProfilePlot" only, "Transition" (default) means
-printing feature legend in transition-level; "Peptide" means printing feature
+\item{featureName}{for "ProfilePlot" only, "Transition" (default) means
+printing feature legend in transition-level; "Peptide" means printing feature
legend in peptide-level; "NA" means no feature legend printing.}
-\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
+\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
QC Plot, and "Condition" in Condition Plot. Default is 10.}
\item{y.axis.size}{size of y-axis labels. Default is 10.}
-\item{text.size}{size of labels represented each condition at the top of
+\item{text.size}{size of labels representing each condition at the top of
graph in Profile Plot and QC plot. Default is 4.}
\item{text.angle}{angle of labels represented each condition at the top
-of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
Default is 0.}
\item{legend.size}{size of feature legend (transition-level or peptide-level)
diff --git a/man/dot-makeQCPlot.Rd b/man/dot-makeQCPlot.Rd
index 98b17d6c..d9992846 100644
--- a/man/dot-makeQCPlot.Rd
+++ b/man/dot-makeQCPlot.Rd
@@ -26,31 +26,31 @@
\item{all_proteins}{character vector of protein names}
-\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
+\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
QC Plot, and "Condition" in Condition Plot. Default is 10.}
\item{y.axis.size}{size of y-axis labels. Default is 10.}
-\item{text.size}{size of labels represented each condition at the top of
+\item{text.size}{size of labels representing each condition at the top of
graph in Profile Plot and QC plot. Default is 4.}
\item{text.angle}{angle of labels represented each condition at the top
-of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
Default is 0.}
\item{legend.size}{size of feature legend (transition-level or peptide-level)
above graph in Profile Plot. Default is 7.}
}
\description{
-To illustrate the quantitative data after data-preprocessing and
-quality control of MS runs, dataProcessPlots takes the quantitative data from
-function (\code{\link{dataProcess}}) as input and automatically generate
-three types of figures in pdf files as output :
-(1) profile plot (specify "ProfilePlot" in option type),
-to identify the potential sources of variation for each protein;
-(2) quality control plot (specify "QCPlot" in option type),
-to evaluate the systematic bias between MS runs;
-(3) mean plot for conditions (specify "ConditionPlot" in option type),
+To illustrate the quantitative data after data-preprocessing and
+quality control of MS runs, dataProcessPlots takes the quantitative data from
+function (\code{\link{dataProcess}}) as input and automatically generates
+three types of figures in pdf files as output:
+(1) profile plot (specify "ProfilePlot" in option type),
+to identify the potential sources of variation for each protein;
+(2) quality control plot (specify "QCPlot" in option type),
+to evaluate the systematic bias between MS runs;
+(3) mean plot for conditions (specify "ConditionPlot" in option type),
to illustrate mean and variability of each condition per protein.
}
\details{
@@ -62,21 +62,21 @@ to illustrate mean and variability of each condition per protein.
The input of this function is the quantitative data from function \code{\link{dataProcess}}.
}
\examples{
-# Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interests,
-# three biological replicates, and no technical replicates which is a time-course experiment.
-# The goal is to provide pre-analysis visualization by automatically generate two types of figures
-# in two separate pdf files.
-# Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
+# Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interest,
+# three biological replicates, and no technical replicates, which is a time-course experiment.
+# The goal is to provide pre-analysis visualization by automatically generating two types of figures
+# in two separate pdf files.
+# Protein IDHC (gene name IDP2) is differentially expressed in time point 1 and time point 7,
# whereas, Protein PMG2 (gene name GPM2) is not.
-QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
# Profile plot
-dataProcessPlots(data=QuantData,type="ProfilePlot")
-# Quality control plot
-dataProcessPlots(data=QuantData,type="QCPlot")
+dataProcessPlots(data = QuantData, type = "ProfilePlot")
+# Quality control plot
+dataProcessPlots(data = QuantData, type = "QCPlot")
# Quantification plot for conditions
-dataProcessPlots(data=QuantData,type="ConditionPlot")
+dataProcessPlots(data = QuantData, type = "ConditionPlot")
}
\keyword{internal}
diff --git a/man/dot-makeSummaryProfilePlot.Rd b/man/dot-makeSummaryProfilePlot.Rd
index 94a2f935..1811d9a7 100644
--- a/man/dot-makeSummaryProfilePlot.Rd
+++ b/man/dot-makeSummaryProfilePlot.Rd
@@ -26,16 +26,16 @@
\item{is_censored}{TRUE if censored values were imputed}
-\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
+\item{x.axis.size}{size of x-axis labeling for "Run" in Profile Plot and
QC Plot, and "Condition" in Condition Plot. Default is 10.}
\item{y.axis.size}{size of y-axis labels. Default is 10.}
-\item{text.size}{size of labels represented each condition at the top of
+\item{text.size}{size of labels representing each condition at the top of
graph in Profile Plot and QC plot. Default is 4.}
\item{text.angle}{angle of labels represented each condition at the top
-of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
+of graph in Profile Plot and QC plot or x-axis labeling in Condition plot.
Default is 0.}
\item{legend.size}{size of feature legend (transition-level or peptide-level)
diff --git a/man/dot-normalizeGlobalStandards.Rd b/man/dot-normalizeGlobalStandards.Rd
index fc356f47..d3012239 100644
--- a/man/dot-normalizeGlobalStandards.Rd
+++ b/man/dot-normalizeGlobalStandards.Rd
@@ -9,10 +9,10 @@
\arguments{
\item{input}{data.table in MSstats format}
-\item{peptides_dict}{`data.table` of names of peptides and their corresponding
+\item{peptides_dict}{`data.table` of names of peptides and their corresponding
features.}
-\item{standards}{character vector with names of standards, required if
+\item{standards}{character vector with names of standards, required if
"GLOBALSTANDARDS" method was selected.}
}
\description{
diff --git a/man/dot-plotComparison.Rd b/man/dot-plotComparison.Rd
index 9e4683cd..fe8df435 100644
--- a/man/dot-plotComparison.Rd
+++ b/man/dot-plotComparison.Rd
@@ -46,8 +46,8 @@
\item{log_base_FC}{log base for log-fold changes - 2 or 10}
-\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
-render, this cannot be used for saving PDF files as plotly do not have
+\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
+render, this cannot be used for saving PDF files as plotly does not have
suppprt for PDFs currently. address and isPlotly cannot be set as TRUE at the
same time.}
}
diff --git a/man/dot-plotHeatmap.Rd b/man/dot-plotHeatmap.Rd
index cdc66ac7..9f9a6d9e 100644
--- a/man/dot-plotHeatmap.Rd
+++ b/man/dot-plotHeatmap.Rd
@@ -52,8 +52,8 @@ For Plotly: use this parameter to adjust the number of proteins to be displayed
\item{address}{the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of "VolcanoPlot.pdf" or "Heatmap.pdf" or "ComparisonPlot.pdf". The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If address=FALSE, plot will be not saved as pdf file but showed in window.}
-\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
-render, this cannot be used for saving PDF files as plotly do not have
+\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
+render, this cannot be used for saving PDF files as plotly does not have
suppprt for PDFs currently. address and isPlotly cannot be set as TRUE at the
same time.}
}
diff --git a/man/dot-plotVolcano.Rd b/man/dot-plotVolcano.Rd
index 1e532688..ca5c5dc3 100644
--- a/man/dot-plotVolcano.Rd
+++ b/man/dot-plotVolcano.Rd
@@ -57,8 +57,8 @@
\item{y.axis.size}{size of axes labels, e.g. name of targeted proteins in heatmap. Default is 10.}
-\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
-render, this cannot be used for saving PDF files as plotly do not have
+\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
+render, this cannot be used for saving PDF files as plotly does not have
suppprt for PDFs currently. address and isPlotly cannot be set as TRUE at the
same time.}
}
diff --git a/man/dot-runTukey.Rd b/man/dot-runTukey.Rd
index 038d2911..d9d07d09 100644
--- a/man/dot-runTukey.Rd
+++ b/man/dot-runTukey.Rd
@@ -11,14 +11,14 @@
\item{is_labeled}{logical, if TRUE, data is coming from an SRM experiment}
-\item{censored_symbol}{Missing values are censored or at random.
-'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
-'0' uses zero intensities as censored intensity.
-In this case, NA intensities are missing at random.
-The output from Skyline should use '0'.
+\item{censored_symbol}{Missing values are censored or missing at random.
+'NA' (default) assumes that all 'NA's in 'Intensity' column are censored.
+'0' uses zero intensities as censored intensity.
+In this case, NA intensities are missing at random.
+The output from Skyline should use '0'.
Null assumes that all NA intensites are randomly missing.}
-\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
+\item{remove50missing}{only for summaryMethod = "TMP". TRUE removes the proteins
where every run has at least 50\% missing values for each peptide. FALSE is default.}
}
\value{
diff --git a/man/dot-setCensoredByThreshold.Rd b/man/dot-setCensoredByThreshold.Rd
index b3ca6e63..5efc6de0 100644
--- a/man/dot-setCensoredByThreshold.Rd
+++ b/man/dot-setCensoredByThreshold.Rd
@@ -3,7 +3,7 @@
\name{.setCensoredByThreshold}
\alias{.setCensoredByThreshold}
\title{Set censored values based on minimum in run/feature/run or feature.
-This is used to initialize the AFT imputation model by supplying the maximum
+This is used to initialize the AFT imputation model by supplying the maximum
possible values for left-censored data as the `time` input to the Surv function.}
\usage{
.setCensoredByThreshold(input, censored_symbol, remove50missing)
@@ -18,7 +18,7 @@ will be removed}
}
\description{
Set censored values based on minimum in run/feature/run or feature.
-This is used to initialize the AFT imputation model by supplying the maximum
+This is used to initialize the AFT imputation model by supplying the maximum
possible values for left-censored data as the `time` input to the Surv function.
}
\keyword{internal}
diff --git a/man/extractSDRF.Rd b/man/extractSDRF.Rd
index c8fd7c5a..b159eeb7 100644
--- a/man/extractSDRF.Rd
+++ b/man/extractSDRF.Rd
@@ -14,7 +14,7 @@ extractSDRF(
)
}
\arguments{
-\item{data}{MSstats formatted data that is the output of a dedicated
+\item{data}{MSstats formatted data that is the output of a dedicated
converter, such as `MaxQtoMSstatsFormat`, `SkylinetoMSstatsFormat`, ect.}
\item{run_name}{Run column name in SDRF data}
@@ -26,23 +26,26 @@ converter, such as `MaxQtoMSstatsFormat`, `SkylinetoMSstatsFormat`, ect.}
\item{fraction}{Fraction column name in SDRF data (if applicable). Default is
`NULL`. If there are no fractions keep `NULL`.}
-\item{meta_data}{A data.frame including any additional meta data for the SDRF
-file that is not included in MSstats. This meta data will be added into the
-final SDRF file. Please ensure the run names in the meta data matches the
+\item{meta_data}{A data.frame including any additional meta data for the SDRF
+file that is not included in MSstats. This meta data will be added into the
+final SDRF file. Please ensure the run names in the meta data match the
run names in the MSstats data.}
}
\description{
Extract experimental design from MSstats format into SDRF format
}
\examples{
-mq_ev = data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_ev.csv",
- package = "MSstatsConvert"))
-mq_pg = data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_pg.csv",
- package = "MSstatsConvert"))
-annot = data.table::fread(system.file("tinytest/raw_data/MaxQuant/annotation.csv",
- package = "MSstatsConvert"))
-maxq_imported = MaxQtoMSstatsFormat(mq_ev, annot, mq_pg, use_log_file = FALSE)
+mq_ev <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_ev.csv",
+ package = "MSstatsConvert"
+))
+mq_pg <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/mq_pg.csv",
+ package = "MSstatsConvert"
+))
+annot <- data.table::fread(system.file("tinytest/raw_data/MaxQuant/annotation.csv",
+ package = "MSstatsConvert"
+))
+maxq_imported <- MaxQtoMSstatsFormat(mq_ev, annot, mq_pg, use_log_file = FALSE)
head(maxq_imported)
-SDRF_file = extractSDRF(maxq_imported)
+SDRF_file <- extractSDRF(maxq_imported)
}
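A small follow-up sketch: SDRF files are tab-separated text, so the extracted design can be written to disk with data.table (already used in the example); the file name is arbitrary and the call assumes SDRF_file is a data.frame-like object as returned above.

data.table::fwrite(SDRF_file, "experimental_design.sdrf.tsv", sep = "\t")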
diff --git a/man/getProcessed.Rd b/man/getProcessed.Rd
index 0dd21210..750cebcc 100644
--- a/man/getProcessed.Rd
+++ b/man/getProcessed.Rd
@@ -16,21 +16,23 @@ data.table processed by dataProcess subfunctions
Get feature-level data to be used in the MSstatsSummarizationOutput function
}
\examples{
-raw = DDARawData
-method = "TMP"
-cens = "NA"
-impute = TRUE
+raw <- DDARawData
+method <- "TMP"
+cens <- "NA"
+impute <- TRUE
MSstatsConvert::MSstatsLogsSettings(FALSE)
-input = MSstatsPrepareForDataProcess(raw, 2, NULL)
-input = MSstatsNormalize(input, "EQUALIZEMEDIANS")
-input = MSstatsMergeFractions(input)
-input = MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
-input_all = MSstatsSelectFeatures(input, "all") # all features
-input_5 = MSstatsSelectFeatures(data.table::copy(input),
-"topN", top_n = 5) # top 5 features
+input <- MSstatsPrepareForDataProcess(raw, 2, NULL)
+input <- MSstatsNormalize(input, "EQUALIZEMEDIANS")
+input <- MSstatsMergeFractions(input)
+input <- MSstatsHandleMissing(input, "TMP", TRUE, "NA", 0.999)
+input_all <- MSstatsSelectFeatures(input, "all") # all features
+input_5 <- MSstatsSelectFeatures(data.table::copy(input),
+ "topN",
+ top_n = 5
+) # top 5 features
-proc1 = getProcessed(input_all)
-proc2 = getProcessed(input_5)
+proc1 <- getProcessed(input_all)
+proc2 <- getProcessed(input_5)
proc1
proc2
diff --git a/man/groupComparison.Rd b/man/groupComparison.Rd
index 40860d8c..33f499b9 100644
--- a/man/groupComparison.Rd
+++ b/man/groupComparison.Rd
@@ -35,13 +35,13 @@ to an existing log file.}
\item{verbose}{logical. If TRUE, information about data processing wil be printed
to the console.}
-\item{log_file_path}{character. Path to a file to which information about
-data processing will be saved.
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}
-\item{numberOfCores}{Number of cores for parallel processing. When > 1,
-a logfile named `MSstats_groupComparison_log_progress.log` is created to
+\item{numberOfCores}{Number of cores for parallel processing. When > 1,
+a logfile named `MSstats_groupComparison_log_progress.log` is created to
track progress. Only works for Linux & Mac OS. Default is 1.}
}
\value{
@@ -96,23 +96,25 @@ The underlying model fitting functions are lm and lmer for the fixed effects mod
The input of this function is the quantitative data from function (dataProcess).
}
\examples{
-# Consider quantitative data (i.e. QuantData) from yeast study with ten time points of interests,
-# three biological replicates, and no technical replicates.
+# Consider quantitative data (i.e. QuantData) from a yeast study with ten time points of interest,
+# three biological replicates, and no technical replicates.
# It is a time-course experiment and we attempt to compare differential abundance
-# between time 1 and 7 in a set of targeted proteins.
-# In this label-based SRM experiment, MSstats uses the fitted model with expanded scope of
-# Biological replication.
+# between time 1 and 7 in a set of targeted proteins.
+# In this label-based SRM experiment, MSstats uses the fitted model with expanded scope of
+# Biological replication.
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
levels(QuantData$ProteinLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
-groups = levels(QuantData$ProteinLevelData$GROUP)
+groups <- levels(QuantData$ProteinLevelData$GROUP)
colnames(comparison) <- groups[order(as.numeric(groups))]
# Tests for differentially abundant proteins with models:
# label-based SRM experiment with expanded scope of biological replication.
-testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
- use_log_file = FALSE)
+testResultOneComparison <- groupComparison(
+ contrast.matrix = comparison, data = QuantData,
+ use_log_file = FALSE
+)
# table for result
testResultOneComparison$ComparisonResult
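An illustrative sketch, not one of the documented examples: contrast rows generalize beyond single comparisons. A common convention is that the positive coefficients sum to 1 and the negative coefficients to -1, so averaging time points 3 and 7 against time point 1 uses the row below; groups is the vector defined in the example above.

# average of T3 and T7 versus T1, with the same column order as above
comparison_avg <- matrix(c(-1, 0, 0.5, 0, 0, 0, 0.5, 0, 0, 0), nrow = 1)
row.names(comparison_avg) <- "avg(T3,T7)-T1"
colnames(comparison_avg) <- groups[order(as.numeric(groups))]
testResultAvg <- groupComparison(
  contrast.matrix = comparison_avg, data = QuantData,
  use_log_file = FALSE
)
head(testResultAvg$ComparisonResult)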
diff --git a/man/groupComparisonPlots.Rd b/man/groupComparisonPlots.Rd
index 6b621ded..97ee951e 100644
--- a/man/groupComparisonPlots.Rd
+++ b/man/groupComparisonPlots.Rd
@@ -79,17 +79,17 @@ For Plotly: use this parameter to adjust the number of proteins to be displayed
\item{address}{the name of folder that will store the results. Default folder is the current working directory. The other assigned folder has to be existed under the current working directory. An output pdf file is automatically created with the default name of "VolcanoPlot.pdf" or "Heatmap.pdf" or "ComparisonPlot.pdf". The command address can help to specify where to store the file as well as how to modify the beginning of the file name. If address=FALSE, plot will be not saved as pdf file but showed in window.}
-\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
-render, this cannot be used for saving PDF files as plotly do not have
+\item{isPlotly}{This parameter is for MSstatsShiny application for plotly
+render, this cannot be used for saving PDF files as plotly does not have
suppprt for PDFs currently. address and isPlotly cannot be set as TRUE at the
same time.}
}
\description{
-To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins,
-groupComparisonPlots takes testing results from function (\code{\link{groupComparison}}) as input and
-automatically generate three types of figures in pdf files as output :
-(1) volcano plot (specify "VolcanoPlot" in option type) for each comparison separately;
-(2) heatmap (specify "Heatmap" in option type) for multiple comparisons ;
+To summarize the results of log-fold changes and adjusted p-values for differentially abundant proteins,
+groupComparisonPlots takes testing results from function (\code{\link{groupComparison}}) as input and
+automatically generates three types of figures in pdf files as output:
+(1) volcano plot (specify "VolcanoPlot" in option type) for each comparison separately;
+(2) heatmap (specify "Heatmap" in option type) for multiple comparisons;
(3) comparison plot (specify "ComparisonPlot" in option type) for multiple comparisons per protein.
}
\details{
@@ -100,40 +100,54 @@ automatically generate three types of figures in pdf files as output :
}
}
\examples{
-QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
## based on multiple comparisons (T1 vs T3; T1 vs T7; T1 vs T9)
-comparison1<-matrix(c(-1,0,1,0,0,0,0,0,0,0),nrow=1)
-comparison2<-matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
-comparison3<-matrix(c(-1,0,0,0,0,0,0,0,1,0),nrow=1)
-comparison<-rbind(comparison1,comparison2, comparison3)
-row.names(comparison)<-c("T3-T1","T7-T1","T9-T1")
-groups = levels(QuantData$ProteinLevelData$GROUP)
+comparison1 <- matrix(c(-1, 0, 1, 0, 0, 0, 0, 0, 0, 0), nrow = 1)
+comparison2 <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
+comparison3 <- matrix(c(-1, 0, 0, 0, 0, 0, 0, 0, 1, 0), nrow = 1)
+comparison <- rbind(comparison1, comparison2, comparison3)
+row.names(comparison) <- c("T3-T1", "T7-T1", "T9-T1")
+groups <- levels(QuantData$ProteinLevelData$GROUP)
colnames(comparison) <- groups[order(as.numeric(groups))]
-testResultMultiComparisons<-groupComparison(contrast.matrix=comparison,
-data=QuantData,
-use_log_file = FALSE)
+testResultMultiComparisons <- groupComparison(
+ contrast.matrix = comparison,
+ data = QuantData,
+ use_log_file = FALSE
+)
testResultMultiComparisons$ComparisonResult
# Volcano plot with FDR cutoff = 0.05 and no FC cutoff
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot",
-logBase.pvalue=2, address="Ex1_")
-# Volcano plot with FDR cutoff = 0.05, FC cutoff = 70, upper y-axis limit = 100,
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
+ logBase.pvalue = 2, address = "Ex1_"
+)
+# Volcano plot with FDR cutoff = 0.05, FC cutoff = 70, upper y-axis limit = 100,
# and no protein name displayed
# FCcutoff=70 is for demonstration purpose
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="VolcanoPlot",
-FCcutoff=70, logBase.pvalue=2, ylimUp=100, ProteinName=FALSE,address="Ex2_")
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
+ FCcutoff = 70, logBase.pvalue = 2, ylimUp = 100, ProteinName = FALSE, address = "Ex2_"
+)
# Heatmap with FDR cutoff = 0.05
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap",
-logBase.pvalue=2, address="Ex1_")
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "Heatmap",
+ logBase.pvalue = 2, address = "Ex1_"
+)
# Heatmap with FDR cutoff = 0.05 and FC cutoff = 70
# FCcutoff=70 is for demonstration purpose
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="Heatmap",
-FCcutoff=70, logBase.pvalue=2, address="Ex2_")
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "Heatmap",
+ FCcutoff = 70, logBase.pvalue = 2, address = "Ex2_"
+)
# Comparison Plot
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot",
-address="Ex1_")
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "ComparisonPlot",
+ address = "Ex1_"
+)
# Comparison Plot
-groupComparisonPlots(data=testResultMultiComparisons$ComparisonResult, type="ComparisonPlot",
-ylimUp=8, ylimDown=-1, address="Ex2_")
+groupComparisonPlots(
+ data = testResultMultiComparisons$ComparisonResult, type = "ComparisonPlot",
+ ylimUp = 8, ylimDown = -1, address = "Ex2_"
+)
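+# Interactive plotly rendering, e.g. for the MSstatsShiny application; a sketch.
+# isPlotly cannot be combined with saving to pdf, so address is left as FALSE:
+# groupComparisonPlots(
+#     data = testResultMultiComparisons$ComparisonResult, type = "VolcanoPlot",
+#     isPlotly = TRUE, address = FALSE
+# )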
}
diff --git a/man/groupComparisonQCPlots.Rd b/man/groupComparisonQCPlots.Rd
index 66ebd1b0..2a08b602 100644
--- a/man/groupComparisonQCPlots.Rd
+++ b/man/groupComparisonQCPlots.Rd
@@ -18,7 +18,7 @@ groupComparisonQCPlots(
\arguments{
\item{data}{output from function groupComparison.}
-\item{type}{choice of visualization. "QQPlots" represents normal quantile-quantile
+\item{type}{choice of visualization. "QQPlots" represents normal quantile-quantile
plot for each protein after fitting models. "ResidualPlots" represents a plot
of residuals versus fitted values for each protein in the dataset.}
@@ -30,8 +30,8 @@ of residuals versus fitted values for each protein in the dataset.}
\item{height}{height of the saved file. Default is 10.}
-\item{which.Protein}{Protein list to draw plots. List can be names of Proteins
-or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
+\item{which.Protein}{List of proteins for which to draw plots. The list can contain names of proteins
+or order numbers of proteins from levels(testResultOneComparison$ComparisonResult$Protein).
Default is "all", which generates all plots for each protein.}
\item{address}{name that will serve as a prefix to the name of output file.}
@@ -40,18 +40,18 @@ Default is "all", which generates all plots for each protein.}
produce a pdf file
}
\description{
-To check the assumption of linear model for whole plot inference,
-groupComparisonQCPlots takes the results after fitting models from function
-(\code{\link{groupComparison}}) as input and automatically generate two types
-of figures in pdf files as output:
+To check the assumption of the linear model for whole plot inference,
+groupComparisonQCPlots takes the results after fitting models from function
+(\code{\link{groupComparison}}) as input and automatically generates two types
+of figures in pdf files as output:
-(1) normal quantile-quantile plot (specify "QQPlot" in option type) for checking
-normally distributed errors.;
-(2) residual plot (specify "ResidualPlot" in option type).
+(1) normal quantile-quantile plot (specify "QQPlots" in option type) for checking
+normally distributed errors;
+(2) residual plot (specify "ResidualPlots" in option type).
}
\details{
-Results based on statistical models for whole plot level inference are
-accurate as long as the assumptions of the model are met. The model assumes that
-the measurement errors are normally distributed with mean 0 and constant variance.
+Results based on statistical models for whole plot level inference are
+accurate as long as the assumptions of the model are met. The model assumes that
+the measurement errors are normally distributed with mean 0 and constant variance.
The assumption of a constant variance can be checked by examining the residuals from the model.
\itemize{
\item{QQPlots : a normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic.}
@@ -62,16 +62,18 @@ The assumption of a constant variance can be checked by examining the residuals
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
levels(QuantData$FeatureLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
# Tests for differentially abundant proteins with models:
# label-based SRM experiment with expanded scope of biological replication.
-testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
-use_log_file = FALSE)
+testResultOneComparison <- groupComparison(
+ contrast.matrix = comparison, data = QuantData,
+ use_log_file = FALSE
+)
# normal quantile-quantile plots
-groupComparisonQCPlots(data=testResultOneComparison, type="QQPlots", address="")
+groupComparisonQCPlots(data = testResultOneComparison, type = "QQPlots", address = "")
# residual plots
-groupComparisonQCPlots(data=testResultOneComparison, type="ResidualPlots", address="")
+groupComparisonQCPlots(data = testResultOneComparison, type = "ResidualPlots", address = "")
}
diff --git a/man/makePeptidesDictionary.Rd b/man/makePeptidesDictionary.Rd
index 0a5131fd..5e34b314 100644
--- a/man/makePeptidesDictionary.Rd
+++ b/man/makePeptidesDictionary.Rd
@@ -20,8 +20,8 @@ with global standards. It is useful for running the summarization workflow
outside of the dataProcess function.
}
\examples{
-input = data.table::as.data.table(DDARawData)
-peptides_dict = makePeptidesDictionary(input, "GLOBALSTANDARDS")
+input <- data.table::as.data.table(DDARawData)
+peptides_dict <- makePeptidesDictionary(input, "GLOBALSTANDARDS")
head(peptides_dict) # ready to be passed to the MSstatsNormalize function
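+# A sketch of a downstream call; the exact arguments of MSstatsNormalize (and of
+# MSstatsPrepareForDataProcess) are assumed here, see their help pages:
+# input <- MSstatsPrepareForDataProcess(input, 2, NULL)
+# normalized <- MSstatsNormalize(input, "GLOBALSTANDARDS",
+#                                peptides_dict = peptides_dict,
+#                                standards = c("chosen standard peptides"))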
}
diff --git a/man/modelBasedQCPlots.Rd b/man/modelBasedQCPlots.Rd
index 8538ab97..b50bcd47 100644
--- a/man/modelBasedQCPlots.Rd
+++ b/man/modelBasedQCPlots.Rd
@@ -19,7 +19,7 @@ modelBasedQCPlots(
\arguments{
\item{data}{output from function groupComparison.}
-\item{type}{choice of visualization. "QQPlots" represents normal quantile-quantile
+\item{type}{choice of visualization. "QQPlots" represents normal quantile-quantile
plot for each protein after fitting models. "ResidualPlots" represents a plot
of residuals versus fitted values for each protein in the dataset.}
@@ -31,8 +31,8 @@ of residuals versus fitted values for each protein in the dataset.}
\item{height}{height of the saved file. Default is 10.}
-\item{which.Protein}{Protein list to draw plots. List can be names of Proteins
-or order numbers of Proteins from levels(testResultOneComparison$ComparisonResult$Protein).
+\item{which.Protein}{List of proteins for which to draw plots. The list can contain names of proteins
+or order numbers of proteins from levels(testResultOneComparison$ComparisonResult$Protein).
Default is "all", which generates all plots for each protein.}
\item{address}{name that will serve as a prefix to the name of output file.}
@@ -41,18 +41,18 @@ Default is "all", which generates all plots for each protein.}
produce a pdf file
}
\description{
-To check the assumption of linear model for whole plot inference,
-modelBasedQCPlots takes the results after fitting models from function
-(\code{\link{groupComparison}}) as input and automatically generate two types
-of figures in pdf files as output:
+To check the assumption of the linear model for whole plot inference,
+modelBasedQCPlots takes the results after fitting models from function
+(\code{\link{groupComparison}}) as input and automatically generates two types
+of figures in pdf files as output:
-(1) normal quantile-quantile plot (specify "QQPlot" in option type) for checking
-normally distributed errors.;
-(2) residual plot (specify "ResidualPlot" in option type).
+(1) normal quantile-quantile plot (specify "QQPlots" in option type) for checking
+normally distributed errors;
+(2) residual plot (specify "ResidualPlots" in option type).
}
\details{
-Results based on statistical models for whole plot level inference are
-accurate as long as the assumptions of the model are met. The model assumes that
-the measurement errors are normally distributed with mean 0 and constant variance.
+Results based on statistical models for whole plot level inference are
+accurate as long as the assumptions of the model are met. The model assumes that
+the measurement errors are normally distributed with mean 0 and constant variance.
The assumption of a constant variance can be checked by examining the residuals from the model.
\itemize{
\item{QQPlots : a normal quantile-quantile plot for each protein is generated in order to check whether the errors are well approximated by a normal distribution. If points fall approximately along a straight line, then the assumption is appropriate for that protein. Only large deviations from the line are problematic.}
@@ -63,16 +63,18 @@ The assumption of a constant variance can be checked by examining the residuals
QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
levels(QuantData$FeatureLevelData$GROUP)
-comparison <- matrix(c(-1,0,0,0,0,0,1,0,0,0),nrow=1)
+comparison <- matrix(c(-1, 0, 0, 0, 0, 0, 1, 0, 0, 0), nrow = 1)
row.names(comparison) <- "T7-T1"
colnames(comparison) <- unique(QuantData$ProteinLevelData$GROUP)
# Tests for differentially abundant proteins with models:
# label-based SRM experiment with expanded scope of biological replication.
-testResultOneComparison <- groupComparison(contrast.matrix=comparison, data=QuantData,
-use_log_file = FALSE)
+testResultOneComparison <- groupComparison(
+ contrast.matrix = comparison, data = QuantData,
+ use_log_file = FALSE
+)
# normal quantile-quantile plots
-modelBasedQCPlots(data=testResultOneComparison, type="QQPlots", address="")
+modelBasedQCPlots(data = testResultOneComparison, type = "QQPlots", address = "")
# residual plots
-modelBasedQCPlots(data=testResultOneComparison, type="ResidualPlots", address="")
+modelBasedQCPlots(data = testResultOneComparison, type = "ResidualPlots", address = "")
}
diff --git a/man/quantification.Rd b/man/quantification.Rd
index 110c5331..d83cfab1 100644
--- a/man/quantification.Rd
+++ b/man/quantification.Rd
@@ -38,8 +38,8 @@ to an existing log file.}
-\item{verbose}{logical. If TRUE, information about data processing wil be printed
+\item{verbose}{logical. If TRUE, information about data processing will be printed
to the console.}
-\item{log_file_path}{character. Path to a file to which information about
-data processing will be saved.
+\item{log_file_path}{character. Path to a file to which information about
+data processing will be saved.
If not provided, such a file will be created automatically.
If `append = TRUE`, has to be a valid path to a file.}
}
@@ -68,13 +68,13 @@ results (data.frame) in a long or matrix format.
# Sample quantification shows model-based estimation of protein abundance in each biological
# replicate within each time point.
# Group quantification shows model-based estimation of protein abundance in each time point.
-QuantData<-dataProcess(SRMRawData, use_log_file = FALSE)
+QuantData <- dataProcess(SRMRawData, use_log_file = FALSE)
head(QuantData$FeatureLevelData)
# Sample quantification
-sampleQuant<-quantification(QuantData, use_log_file = FALSE)
+sampleQuant <- quantification(QuantData, use_log_file = FALSE)
head(sampleQuant)
# Group quantification
-groupQuant<-quantification(QuantData, type="Group", use_log_file = FALSE)
+groupQuant <- quantification(QuantData, type = "Group", use_log_file = FALSE)
head(groupQuant)
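+# Results can also be returned in long rather than matrix form; a sketch, with the
+# argument name assumed here (see ?quantification):
+# groupQuantLong <- quantification(QuantData, type = "Group", format = "long",
+#                                  use_log_file = FALSE)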
}
diff --git a/tests/tinytest.R b/tests/tinytest.R
index b9681980..1bd81213 100644
--- a/tests/tinytest.R
+++ b/tests/tinytest.R
@@ -1,3 +1,3 @@
-if ( requireNamespace("tinytest", quietly=TRUE) ){
+if (requireNamespace("tinytest", quietly = TRUE)) {
tinytest::test_package("MSstats")
}
diff --git a/vignettes/MSstats.Rmd b/vignettes/MSstats.Rmd
index 9ba8c8b9..a89ca4ee 100644
--- a/vignettes/MSstats.Rmd
+++ b/vignettes/MSstats.Rmd
@@ -4,8 +4,8 @@ BiocStyle::markdown()
```
```{r global_options, include=FALSE}
-knitr::opts_chunk$set(fig.width=10, fig.height=7, warning=FALSE, message=FALSE)
-options(width=110)
+knitr::opts_chunk$set(fig.width = 10, fig.height = 7, warning = FALSE, message = FALSE)
+options(width = 110)
```