diff --git a/.gitignore b/.gitignore index 8db067bcb..bd1be61b0 100644 --- a/.gitignore +++ b/.gitignore @@ -10,8 +10,3 @@ docs/ .DS_Store mia.BiocCheck/ -src/*.dll -inst/extdata/taxonomic_profiles_mgx.tsv -inst/extdata/ecs_relab.tsv -inst/extdata/hmp2_metadata_2018-08-20.csv -tools/cache/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 0d9638799..38bceae30 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.19.5 +Version: 1.19.6 Authors@R: c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), email = "tuomas.v.borman@utu.fi", diff --git a/NAMESPACE b/NAMESPACE index d28a53b3f..78b47e87a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(addDissimilarity) export(addDivergence) export(addDominant) export(addHierarchyTree) +export(addJointRPCA) export(addLDA) export(addMDS) export(addMediation) @@ -24,6 +25,7 @@ export(addPerSampleDominantTaxa) export(addPrevalence) export(addPrevalentAbundance) export(addRDA) +export(addRPCA) export(addTaxonomyTree) export(agglomerateByModule) export(agglomerateByPrevalence) @@ -85,6 +87,7 @@ export(getPrevalentAbundance) export(getPrevalentFeatures) export(getPrevalentTaxa) export(getRDA) +export(getRPCA) export(getRare) export(getRareFeatures) export(getRareTaxa) @@ -183,6 +186,7 @@ exportMethods(addDissimilarity) exportMethods(addDivergence) exportMethods(addDominant) exportMethods(addHierarchyTree) +exportMethods(addJointRPCA) exportMethods(addLDA) exportMethods(addMDS) exportMethods(addMediation) @@ -194,6 +198,7 @@ exportMethods(addPerSampleDominantTaxa) exportMethods(addPrevalence) exportMethods(addPrevalentAbundance) exportMethods(addRDA) +exportMethods(addRPCA) exportMethods(addTaxonomyTree) exportMethods(agglomerateByModule) exportMethods(agglomerateByPrevalence) @@ -235,6 +240,7 @@ exportMethods(getDominant) exportMethods(getExperimentCrossAssociation) exportMethods(getExperimentCrossCorrelation) 
exportMethods(getHierarchyTree) +exportMethods(getJointRPCA) exportMethods(getLDA) exportMethods(getLowAbundant) exportMethods(getMDS) @@ -249,6 +255,7 @@ exportMethods(getPrevalentAbundance) exportMethods(getPrevalentFeatures) exportMethods(getPrevalentTaxa) exportMethods(getRDA) +exportMethods(getRPCA) exportMethods(getRare) exportMethods(getRareFeatures) exportMethods(getRareTaxa) @@ -358,6 +365,7 @@ importFrom(MatrixGenerics,rowSums2) importFrom(MultiAssayExperiment,ExperimentList) importFrom(MultiAssayExperiment,MultiAssayExperiment) importFrom(MultiAssayExperiment,experiments) +importFrom(MultiAssayExperiment,intersectColumns) importFrom(MultiAssayExperiment,sampleMap) importFrom(Rcpp,evalCpp) importFrom(Rcpp,sourceCpp) @@ -435,6 +443,7 @@ importFrom(stats,chisq.test) importFrom(stats,cmdscale) importFrom(stats,cor) importFrom(stats,cor.test) +importFrom(stats,dist) importFrom(stats,formula) importFrom(stats,gaussian) importFrom(stats,glm) diff --git a/R/AllGenerics.R b/R/AllGenerics.R index 5a12bb524..241789a42 100644 --- a/R/AllGenerics.R +++ b/R/AllGenerics.R @@ -410,3 +410,23 @@ setGeneric("addMDS", signature = "x", function(x, ...) #' @export setGeneric("getReducedDimAttribute", signature = "x", function(x, ...) standardGeneric("getReducedDimAttribute")) + +#' @rdname getRPCA +#' @export +setGeneric("getRPCA", signature = "x", function(x, ...) + standardGeneric("getRPCA")) + +#' @rdname getRPCA +#' @export +setGeneric("addRPCA", signature = "x", function(x, ...) + standardGeneric("addRPCA")) + +#' @rdname getRPCA +#' @export +setGeneric("getJointRPCA", signature = "x", function(x, ...) + standardGeneric("getJointRPCA")) + +#' @rdname getRPCA +#' @export +setGeneric("addJointRPCA", signature = "x", function(x, ...) 
+ standardGeneric("addJointRPCA")) diff --git a/R/cluster.R b/R/cluster.R index 2038932bd..0952ce900 100644 --- a/R/cluster.R +++ b/R/cluster.R @@ -104,7 +104,7 @@ setMethod("addCluster", signature = c(x = "SummarizedExperiment"), # result <- getCluster( x = x, BLUSPARAM = BLUSPARAM, assay.type = assay.type, - dimred = dimred, by = by, full = full, ...) + by = by, full = full, ...) # If user has specified full=TRUE, result includes additional info # that will be stored to metadata. if( full ){ diff --git a/R/getJointRPCA.R b/R/getJointRPCA.R deleted file mode 100644 index f8d8b6f55..000000000 --- a/R/getJointRPCA.R +++ /dev/null @@ -1,1191 +0,0 @@ -## Joint RPCA front-end helpers -## -## Exported: -## - jointRPCAuniversal() -## - getJointRPCA() -## - -#' Run Joint-RPCA and store embedding in reducedDim -#' @name getJointRPCA -#' -#' @details -#' Convenience wrapper that runs Joint Robust PCA on one or more compositional -#' tables and stores the resulting sample embedding in \code{reducedDim(x, name)}, -#' similar to \code{runMDS()} and \code{runPCA()}. -#' -#' @param x A \code{SummarizedExperiment}, \code{TreeSummarizedExperiment}, -#' \code{MultiAssayExperiment}, or a related object supported by -#' \code{jointRPCAuniversal()}. -#' @param experiments Optional character vector of experiment names to use when -#' \code{x} is a \code{MultiAssayExperiment} (i.e. \code{names(experiments(x))}). -#' Ignored for \code{SummarizedExperiment} inputs. -#' @param altexp Optional name of an alternative experiment. If supplied, -#' Joint-RPCA is run on \code{altExp(x, altexp)} instead of \code{x}. -#' @param name Character scalar giving the name of the \code{reducedDim} slot -#' in which to store the joint sample embedding. Defaults to \code{"JointRPCA"}. -#' @param transform Character string specifying preprocessing applied to each -#' input table before ordination. 
Use \code{"rclr"} to apply the robust CLR -#' transform (via \code{decostand(method = "rclr")}) or \code{"none"} to -#' disable transformation (data are used as-is after masking non-finite values). -#' @param optspace.tol Numeric tolerance passed to \code{optspace()}. -#' @param center Logical; whether to center the reconstructed low-rank matrix -#' (double-centering) prior to SVD/PCA steps. -#' @param scale Logical; whether to scale the reconstructed matrix prior to -#' SVD/PCA steps. Defaults to \code{FALSE}. -#' @param ... Additional arguments passed to \code{jointRPCAuniversal()} and then -#' to the internal \code{.joint_rpca()} engine (e.g. \code{n.components}, -#' \code{min.sample.count}, \code{min.feature.count}, \code{min.feature.frequency}, -#' \code{max.iterations}, \code{sample.metadata}). -#' -#' @return The input object \code{x} with a new entry in -#' \code{reducedDim(x, name)} containing the Joint-RPCA sample embedding. -#' The full Joint-RPCA result (including distances, cross-validation -#' statistics and transformed tables) is stored in -#' \code{metadata(x)$JointRPCA[[name]]}. -#' -#' @export -NULL - -getJointRPCA <- function(x, - experiments = NULL, - altexp = NULL, - name = "JointRPCA", - transform = c("rclr", "none"), - optspace.tol = 1e-5, - center = TRUE, - scale = FALSE, - ...) { - transform <- match.arg(transform) - - # Select the object to operate on - y <- x - if (!is.null(altexp)) { - y <- altExp(x, altexp) - } - - # Use universal front-end to build tables + run .joint_rpca() - res <- jointRPCAuniversal( - y, - experiments = experiments, - transform = transform, - optspace.tol = optspace.tol, - center = center, - scale = scale, - ... - ) - - # Extract sample embedding - emb <- res[["ord_res"]][["samples"]] - - if (is.null(emb)) { - stop( - "Internal error: JointRPCA did not return a sample embedding. ", - "Please report this and include sessionInfo().", - call. 
= FALSE - ) - } - - emb <- as.matrix(emb) - - # Ensure embedding rownames match colnames of the target object (Bioconductor requirement) - target_cols <- colnames(x) - if (!is.null(altexp)) { - target_cols <- colnames(x) - } - - if (is.null(target_cols)) { - stop("Cannot store reducedDim: 'x' has no colnames().", call. = FALSE) - } - - if (is.null(rownames(emb))) { - stop("Cannot store reducedDim: embedding has no rownames().", call. = FALSE) - } - - # Require the same set of samples/cells - if (!setequal(rownames(emb), target_cols)) { - missing_in_emb <- setdiff(target_cols, rownames(emb)) - extra_in_emb <- setdiff(rownames(emb), target_cols) - stop( - "Cannot store reducedDim: embedding rownames do not match colnames(x).\n", - "Missing in embedding: ", paste(missing_in_emb, collapse = ", "), "\n", - "Extra in embedding: ", paste(extra_in_emb, collapse = ", "), - call. = FALSE - ) - } - - # Reorder embedding to exactly match colnames(x) - emb <- emb[target_cols, , drop = FALSE] - rownames(emb) <- target_cols - - if (nrow(emb) == 0L || ncol(emb) == 0L || is.null(rownames(emb))) { - stop( - "Internal error: JointRPCA returned an invalid sample embedding. ", - "Please report this and include sessionInfo().", - call. = FALSE - ) - } - - # Store embedding in reducedDim only if supported (SCE / TreeSE / mia-specific) - cls <- class(x) - if (any(cls %in% c("SingleCellExperiment", "TreeSummarizedExperiment"))) { - reducedDim(x, name) <- emb - } - - # Store full result in metadata - if (is.null(metadata(x)$JointRPCA)) { - metadata(x)$JointRPCA <- list() - } - metadata(x)$JointRPCA[[name]] <- res - - return(x) -} - -#' Universal Joint RPCA Wrapper -#' @name getJointRPCA -#' @param x Input object: \code{MultiAssayExperiment}, \code{SummarizedExperiment} -#' (including \code{TreeSummarizedExperiment}), list of matrices, or single matrix. -#' @param experiments Character vector of experiment names to extract when \code{x} -#' is a \code{MultiAssayExperiment} (i.e. 
\code{names(experiments(x))}). -#' If \code{NULL}, all experiments are used. -#' -#' For \code{MultiAssayExperiment} inputs, \strong{one assay per experiment} is -#' used: by default the first assay returned by -#' \code{assayNames()} (or index \code{1L} if unnamed). -#' The actually used assay names are recorded in \code{$assay_names_used} in -#' the result. If you need a different assay (e.g. \code{"relab"} instead of -#' \code{"counts"}), subset or reorder assays in \code{x} before calling -#' \code{jointRPCAuniversal()}. -#' @param transform Character string specifying preprocessing applied to each -#' input table before ordination. Use \code{"rclr"} to apply the robust CLR -#' transform (via \code{decostand(method = "rclr")}) or \code{"none"} to -#' disable transformation (data are used as-is after masking non-finite values). -#' @param optspace.tol Numeric tolerance passed to \code{optspace()}. -#' @param center Logical; whether to center the reconstructed low-rank matrix -#' (double-centering) prior to SVD/PCA steps. -#' @param scale Logical; whether to scale the reconstructed matrix prior to -#' SVD/PCA steps. Defaults to \code{FALSE}. -#' @param ... Additional arguments passed to \code{.joint_rpca()}. -#' -#' @return Output from \code{.joint_rpca()} with extra fields when \code{x} is a -#' \code{MultiAssayExperiment}: -#' \itemize{ -#' \item \code{$experiment_names}: character vector of experiments used. -#' \item \code{$assay_names_used}: named character vector giving, for each -#' experiment, the assay name that was used (typically the first in -#' \code{assayNames()}). -#' } -#' @importFrom SummarizedExperiment assayNames -#' @importFrom SummarizedExperiment assay -#' @importFrom MultiAssayExperiment experiments -#' @importFrom vegan decostand -#' @importFrom vegan optspace -#' @export -NULL - -jointRPCAuniversal <- function(x, experiments = NULL, - transform = c("rclr", "none"), - optspace.tol = 1e-5, - center = TRUE, - scale = FALSE, - ...) 
{ - transform <- match.arg(transform) - - assay_names_used <- NULL - - if (inherits(x, "MultiAssayExperiment")) { - mae <- .extract_mae_tables(x, experiments) - tables <- mae$tables - experiments <- mae$experiments - assay_names_used <- mae$assay_names_used - } else if (inherits(x, "SummarizedExperiment")) { - tables <- list(assay(x)) - anm <- assayNames(x) - nm <- if (length(anm) && !is.na(anm[1])) anm[1] else "assay1" - names(tables) <- nm - } else if (is.list(x) && all(vapply(x, is.matrix, logical(1)))) { - tables <- x - if (is.null(names(tables))) { - names(tables) <- paste0("view", seq_along(tables)) - } - } else if (is.matrix(x)) { - tables <- list(x) - names(tables) <- "assay1" - } else { - stop( - "Unsupported input type for jointRPCAuniversal(): ", - paste(class(x), collapse = ", "), - call. = FALSE - ) - } - - res <- .joint_rpca( - tables = tables, - transform = transform, - optspace.tol = optspace.tol, - center = center, - scale = scale, - ... - ) - - if (inherits(x, "MultiAssayExperiment")) { - res$experiment_names <- experiments - res$assay_names_used <- assay_names_used - } - - return(res) -} - -#' Joint Robust PCA on Multiple Compositional Tables -#' -#' Internal engine for Joint Robust Principal Component Analysis (RPCA) using -#' OptSpace on multiple compositional tables. -#' -#' This function assumes a list of already extracted tables and is typically -#' called via \code{jointRPCAuniversal()} or \code{getJointRPCA()}. -#' -#' @param tables A list of compositional data tables (matrices or data frames). -#' @param n.test.samples Integer specifying the number of samples to hold out for testing -#' (only used if \code{sample.metadata} is \code{NULL}). Default is 10. -#' @param sample.metadata Optional data frame containing sample-level metadata. -#' @param train.test.column The name of the column in \code{sample.metadata} -#' that defines training vs test samples. 
-#' @param n.components Integer specifying the number of principal components to compute. -#' @param transform Character string specifying preprocessing applied to each -#' input table before ordination: \code{"rclr"} or \code{"none"}. -#' @param optspace.tol Numeric tolerance passed to \code{vegan::optspace()}. -#' @param center,scale Logical; whether to center/scale the reconstructed matrix -#' prior to SVD/PCA steps. -#' @param min.sample.count Minimum total count required for a sample to be retained. -#' @param min.feature.count Minimum total count required for a feature to be retained. -#' @param min.feature.frequency Minimum percentage (0–100) of samples in which a -#' feature must be non-zero to be retained. -#' @param max.iterations Maximum number of optimization iterations. -#' -#' @return A list with \code{ord_res}, \code{dist}, \code{cv_stats}, and -#' \code{rclr_tables}. -#' -#' @keywords internal -#' @noRd - -.joint_rpca <- function(tables, - n.test.samples = 10, - sample.metadata = NULL, - train.test.column = NULL, - n.components = 3, - transform = c("rclr", "none"), - min.sample.count = 0, - min.feature.count = 0, - min.feature.frequency = 0, - max.iterations = 5, - optspace.tol = 1e-5, - center = TRUE, - scale = FALSE) { - transform <- match.arg(transform) - - if (is.null(names(tables))) { - names(tables) <- paste0("view", seq_along(tables)) - } - - if (n.components < 2) { - stop("n.components must be at least 2.", call. = FALSE) - } - if (max.iterations < 1) { - stop("max.iterations must be at least 1.", call. = FALSE) - } - - # Filtering (always done, independent of rclr) - tables <- lapply(tables, function(tbl) { - out <- .rpca_table_processing( - tbl, - min.sample.count = min.sample.count, - min.feature.count = min.feature.count, - min.feature.frequency = min.feature.frequency - ) - if (nrow(out) == 0L || ncol(out) == 0L) { - stop( - "Filtering removed all data in at least one table (0 features or 0 samples). 
", - "Try relaxing filtering thresholds (min.sample.count / min.feature.count / ", - "min.feature.frequency) or check your input.", - call. = FALSE - ) - } - out - }) - - # Find shared samples across views - sample.sets <- lapply(tables, colnames) - shared.all.samples <- Reduce(intersect, sample.sets) - if (length(shared.all.samples) == 0L) { - stop( - "No samples overlap between all tables. ", - "Check that colnames are consistent across views, or", - "if you are using pre-transformed tables set transform = 'none'.", - call. = FALSE - ) - } - if (length(shared.all.samples) < (n.components + 1L)) { - stop( - "Too few shared samples across all tables after filtering (", - length(shared.all.samples), "). Need at least n.components + 1 shared samples. ", - "Try lowering n.components or relaxing filtering thresholds.", - call. = FALSE - ) - } - unshared.samples <- setdiff(unique(unlist(sample.sets)), shared.all.samples) - if (length(unshared.samples) > 0) { - warning(sprintf("Removing %d sample(s) that do not overlap in tables.", length(unshared.samples))) - } - - # Restrict each table to the shared sample set - tables <- lapply(tables, function(tbl) { - tbl[, shared.all.samples, drop = FALSE] - }) - shared.all.samples <- Reduce(intersect, lapply(tables, colnames)) - - # Transform tables: rCLR or masking - rclr_tables <- lapply(tables, function(tbl) { - mat <- as.matrix(tbl) - rown <- rownames(mat) - coln <- colnames(mat) - - if (transform == "rclr") { - mat[!is.finite(mat)] <- 0 - mat[mat < 0] <- 0 - out <- vegan::decostand(mat, method = "rclr", MARGIN = 2) - dimnames(out) <- list(rown, coln) - out - } else { - out <- .mask_value_only(mat)$data - dimnames(out) <- list(rown, coln) - out - } - }) - names(rclr_tables) <- names(tables) - - # Determine train/test split - if (!is.null(sample.metadata) && !is.null(train.test.column)) { - md <- as.data.frame(sample.metadata) - md <- md[shared.all.samples, , drop = FALSE] - train.samples <- rownames(md)[md[[train.test.column]] 
== "train"] - test.samples <- rownames(md)[md[[train.test.column]] == "test"] - } else { - ord.tmp <- .optspace_helper( - rclr.table = t(rclr_tables[[1]]), - feature.ids = rownames(rclr_tables[[1]]), - sample.ids = colnames(rclr_tables[[1]]), - n.components = n.components, - max.iterations = max.iterations, - tol = optspace.tol, - center = center, - scale = scale - )$ord_res - sorted.ids <- rownames(ord.tmp$samples[order(ord.tmp$samples[, 1]), ]) - idx <- round(seq(1, length(sorted.ids), length.out = n.test.samples)) - test.samples <- sorted.ids[idx] - train.samples <- setdiff(shared.all.samples, test.samples) - } - - # Run joint OptSpace - result <- .joint_optspace_helper( - tables = rclr_tables, - n.components = n.components, - max.iterations = max.iterations, - test.samples = test.samples, - train.samples = train.samples, - sample.order = shared.all.samples, - tol = optspace.tol, - center = center, - scale = scale - ) - - return(list( - ord_res = result$ord_res, - dist = result$dist, - cv_stats = result$cv_stats, - rclr_tables = rclr_tables - )) -} - -#' Joint RPCA Ordination Across Multiple Compositional Tables -#' -#' Internal function that performs Robust PCA via joint OptSpace decomposition across multiple compositional tables. -#' It splits each table into train/test sets, applies joint factorization, reconstructs sample and feature embeddings, -#' optionally projects test samples, computes a sample distance matrix, and returns cross-validation error statistics. -#' -#' @param tables A list of compositional matrices or data frames with features as rows and samples as columns. -#' @param n.components Number of principal components to compute. -#' @param max.iterations Maximum number of optimization iterations for OptSpace. -#' @param test.samples Character vector of sample IDs to be projected into the ordination space. -#' @param train.samples Character vector of sample IDs used to fit the ordination. 
-#' -#' @return A list with: -#' \describe{ -#' \item{ord_res}{An \code{OrdinationResults} object containing embeddings, loadings, and variance explained.} -#' \item{dist}{A \code{DistanceMatrix} object for sample embeddings.} -#' \item{cv_stats}{A data frame summarizing reconstruction error across iterations and tables.} -#' } -#' -#' @keywords internal -#' @noRd - -.joint_optspace_helper <- function(tables, - n.components, - max.iterations, - test.samples, - train.samples, - sample.order = NULL, - tol = 1e-5, - center = TRUE, - scale = FALSE) { - # Coerce to matrices and enforce colnames presence - tables <- lapply(tables, function(tbl) { - mat <- as.matrix(tbl) - if (is.null(colnames(mat))) { - stop("[.joint_optspace_helper] Input table is missing column names (sample IDs).") - } - mat - }) - - # Global set of samples present in all views - all_samples <- Reduce(intersect, lapply(tables, colnames)) - - # Align train/test to actually available samples - test.samples <- intersect(test.samples, all_samples) - train.samples <- intersect(train.samples, all_samples) - - if (!length(test.samples) || !length(train.samples)) { - stop("[.joint_optspace_helper] Empty train/test split after aligning sample IDs.") - } - - # Split and transpose training/test data per table - tables.split <- lapply(tables, function(tbl) { - list( - t(tbl[, test.samples, drop = FALSE]), - t(tbl[, train.samples, drop = FALSE]) - ) - }) - - # Format input for solver - tables.for.solver <- lapply(tables.split, function(pair) { - lapply(pair, as.matrix) - }) - - # Run joint OptSpace solver - opt.result <- .joint_optspace_solve( - train.test.pairs = tables.for.solver, - n.components = n.components, - max.iter = max.iterations, - tol = tol - ) - - U <- opt.result$U - S <- opt.result$S - V_list <- opt.result$V_list - dists <- opt.result$dists - - # Assign row/column names to loadings - pc.names <- paste0("PC", seq_len(n.components)) - - # Combine feature loadings with table-derived row names - vjoint 
<- do.call(rbind, Map(function(tbl, V) { - rownames(V) <- rownames(tbl) - colnames(V) <- pc.names - V - }, tables, V_list)) - - U <- U[seq_along(train.samples), , drop = FALSE] - rownames(U) <- train.samples - colnames(U) <- pc.names - - # Recenter & re-factor via SVD - X <- U %*% S %*% t(vjoint) - - if (center) { - X <- sweep(X, 2, colMeans(X)) - X <- sweep(X, 1, rowMeans(X)) - } - if (scale) { - X <- scale(X, center = FALSE, scale = TRUE) - } - svd.res <- svd(X) - u <- svd.res$u[, seq_len(n.components), drop = FALSE] - v <- svd.res$v[, seq_len(n.components), drop = FALSE] - s.eig <- svd.res$d[seq_len(n.components)] - - rownames(u) <- train.samples - rownames(v) <- rownames(vjoint) - pc.names <- paste0("PC", seq_len(n.components)) - colnames(u) <- colnames(v) <- pc.names - - # Build a named per-view features list - features_list <- lapply(seq_along(tables), function(i) { - rid <- rownames(tables[[i]]) - v[rid, , drop = FALSE] - }) - names(features_list) <- names(tables) - - prop.exp <- s.eig^2 / sum(s.eig^2) - ord_res <- .ordination_results( - method = "rpca", - eigvals = setNames(s.eig, pc.names), - samples = u, - features = features_list, - proportion.explained = setNames(prop.exp, pc.names) - ) - - # Project test samples - if (length(test.samples) > 0) { - test.matrices <- lapply(tables, function(tbl) tbl[, test.samples, drop = FALSE]) - names(test.matrices) <- names(tables) - ord_res <- .transform(ord_res, test.matrices, apply.rclr = FALSE) - } - - # Compute distance matrix and CV error summary - dist.base <- as.matrix(dist(ord_res$samples)) - - if (!is.null(sample.order)) { - order_use <- intersect(sample.order, rownames(dist.base)) - dist.mat <- dist.base[order_use, order_use, drop = FALSE] - } else { - dist.mat <- dist.base - order_use <- rownames(dist.base) - } - - dist.res <- .distance_matrix(dist.mat, ids = order_use) - - cv.dist <- data.frame(t(dists)) - colnames(cv.dist) <- c("mean_CV", "std_CV") - cv.dist$run <- sprintf( - 
"tables_%d.n.components_%d.max.iterations_%d.n.test_%d", - length(tables), n.components, max.iterations, length(test.samples) - ) - cv.dist$iteration <- seq_len(nrow(cv.dist)) - rownames(cv.dist) <- seq_len(nrow(cv.dist)) - - return(list(ord_res = ord_res, dist = dist.res, cv_stats = cv.dist)) -} - -#' Apply Projection of New Compositional Tables to Existing Ordination -#' -#' Internal function that transforms and projects new sample tables into an existing Joint RPCA ordination space. -#' It handles feature alignment, optional rCLR preprocessing, padding of missing features, -#' and merges projected samples with existing ones. -#' -#' @param ordination A list containing previous ordination results: `samples`, `features`, and `eigvals`. -#' @param tables A named list of new compositional tables (matrices or data frames) -#' with features as rows and samples as columns. -#' @param apply.rclr Logical; whether to apply rCLR transformation to new input tables before projection. Default is `TRUE`. -#' -#' @return An updated ordination list with the `samples` matrix extended to include new projected samples. 
-#' @keywords internal -#' @noRd - -.transform <- function(ordination, tables, - apply.rclr = TRUE) { - Udf <- ordination$samples - Vobj <- ordination$features - s.eig <- ordination$eigvals - - # Ensure tables is a list of views - if (!is.list(tables)) { - stop("[.transform] 'tables' must be a list of view matrices (features x samples).") - } - - if (is.list(Vobj) && !is.null(names(Vobj))) { - if (is.null(names(tables))) { - names(tables) <- names(Vobj)[seq_along(tables)] - } - if (is.null(names(tables))) { - stop("[.transform] 'tables' must be a *named* list of view matrices (features x samples).") - } - } - - # rCLR if requested - prep_view <- function(tab) { - mat <- as.matrix(tab) - rown <- rownames(mat) - coln <- colnames(mat) - - if (apply.rclr) { - mat <- vegan::decostand(mat, method = "rclr", MARGIN = 2) - - dimnames(mat) <- list(rown, coln) - } - - storage.mode(mat) <- "double" - mat[!is.finite(mat)] <- 0 - mat - } - tables <- lapply(tables, prep_view) - - if (is.matrix(Vobj)) { - all.features <- rownames(Vobj) - tables <- lapply(tables, function(mat) { - miss <- setdiff(all.features, rownames(mat)) - if (length(miss)) { - pad <- matrix(0, - nrow = length(miss), ncol = ncol(mat), - dimnames = list(miss, colnames(mat)) - ) - mat <- rbind(mat, pad) - } - mat[all.features, , drop = FALSE] - }) - proj.mat <- do.call(cbind, tables) - colnames(proj.mat) <- make.unique(colnames(proj.mat), sep = "_") - ordination$samples <- .transform_helper(Udf, Vobj, s.eig, proj.mat) - return(ordination) - } - - # 3+-omic path: V is a named list per view - if (!is.list(Vobj) || is.null(names(Vobj))) { - stop("[.transform] ordination$features is neither a matrix nor a named list.") - } - - # Intersect views by name, preserve training order - views <- intersect(names(Vobj), names(tables)) - if (!length(views)) stop("[.transform] No overlapping view names between ordination and new tables.") - - test.matrices <- list() - for (vw in views) { - Vvw <- Vobj[[vw]] - 
stopifnot(is.matrix(Vvw), !is.null(rownames(Vvw))) - mat <- tables[[vw]] - - train_feats <- rownames(Vvw) - miss <- setdiff(train_feats, rownames(mat)) - if (length(miss)) { - pad <- matrix(0, - nrow = length(miss), ncol = ncol(mat), - dimnames = list(miss, colnames(mat)) - ) - mat <- rbind(mat, pad) - } - mat <- mat[train_feats, , drop = FALSE] - - test.matrices[[vw]] <- mat - } - - ordination$samples <- .transform_helper(Udf, Vobj, s.eig, test.matrices) - return(ordination) -} - -#' Project New Data into Existing Ordination Space -#' -#' Internal function to align rCLR-transformed samples to an existing RPCA ordination space. -#' Handles feature alignment, deduplication of sample names, double-centering normalization, -#' and projection into low-rank space using previously learned components. -#' -#' @param Udf Matrix of training sample embeddings (samples × components). -#' @param Vdf Matrix or list of feature loadings (features × components, or per-view list). -#' @param s.eig Singular values from the RPCA decomposition. -#' @param table.rclr.project New rCLR-transformed table(s) for projection (features × samples). -#' If `Vdf` is a matrix, provide a single matrix; if `Vdf` is a list, provide a named list of matrices per view. -#' @param dedup.samples Logical; whether to merge samples with identical names (e.g. suffixes like `_1`, `_2`) -#' by averaging their feature values. Default is `TRUE`. Set to `FALSE` to preserve all duplicate sample IDs. -#' -#' @return A combined matrix of training and projected samples (samples × components). 
-#' @keywords internal -#' @noRd - -.transform_helper <- function(Udf, Vdf, s.eig, table.rclr.project, - dedup.samples = TRUE) { - # Legacy path (single view) - if (is.matrix(Vdf)) { - stopifnot(is.matrix(table.rclr.project)) - # Align rows by name - common <- intersect(rownames(Vdf), rownames(table.rclr.project)) - if (length(common) < ncol(Udf)) { - stop(sprintf("[.transform_helper] Too few matching features: %d", length(common))) - } - - M <- t(as.matrix(table.rclr.project[common, , drop = FALSE])) - V <- as.matrix(Vdf[common, , drop = FALSE]) - - # Dedup of sample IDs - if (dedup.samples) { - sid <- sub("_\\d+$", "", rownames(M)) - if (any(duplicated(sid))) { - M <- rowsum(M, group = sid, reorder = FALSE) / as.vector(table(sid)) - } else { - rownames(M) <- sid - } - } - - # Projection (match training scaling) - Uproj <- M %*% V - # Scale by singular values - if (length(s.eig)) { - Sinv <- diag(1 / s.eig, nrow = length(s.eig)) - Uproj <- Uproj %*% Sinv - } - - colnames(Uproj) <- colnames(Udf) - U.combined <- rbind(Udf[setdiff(rownames(Udf), rownames(Uproj)), , drop = FALSE], Uproj) - return(U.combined) - } - - # Multi-view path (named lists) - stopifnot(is.list(Vdf), is.list(table.rclr.project)) - views <- intersect(names(Vdf), names(table.rclr.project)) - if (!length(views)) stop("[.transform_helper] No overlapping views.") - - # Project per view, then sum contributions in the shared latent space - Usum <- NULL - ncomp <- ncol(Udf) - for (vw in views) { - Vvw <- Vdf[[vw]] - Tvw <- table.rclr.project[[vw]] - stopifnot(is.matrix(Vvw), is.matrix(Tvw)) - - common <- intersect(rownames(Vvw), rownames(Tvw)) - if (length(common) < ncomp) { - stop(sprintf("[.transform_helper] View '%s': too few matching features (%d).", vw, length(common))) - } - - M <- t(as.matrix(Tvw[common, , drop = FALSE])) - V <- as.matrix(Vvw[common, , drop = FALSE]) - - # Accumulate per-view U - Uvw <- M %*% V - if (is.null(Usum)) { - Usum <- Uvw - } else { - # Align rows (samples) by name 
before summing - all_s <- union(rownames(Usum), rownames(Uvw)) - Utmp <- matrix(0, - nrow = length(all_s), ncol = ncol(Udf), - dimnames = list(all_s, colnames(Udf)) - ) - Utmp[rownames(Usum), ] <- Usum - Utmp[rownames(Uvw), ] <- Utmp[rownames(Uvw), ] + Uvw - Usum <- Utmp - } - } - - # Sample dedup (after combining views) - if (dedup.samples) { - sid <- sub("_\\d+$", "", rownames(Usum)) - if (any(duplicated(sid))) { - Usum <- rowsum(Usum, group = sid, reorder = FALSE) / as.vector(table(sid)) - } else { - rownames(Usum) <- sid - } - } - - # Scale by S - if (length(s.eig)) { - Sinv <- diag(1 / s.eig, nrow = length(s.eig)) - Usum <- Usum %*% Sinv - } - colnames(Usum) <- colnames(Udf) - - # Merge with training U, avoiding duplicates - keep_train <- setdiff(rownames(Udf), rownames(Usum)) - rbind(Udf[keep_train, , drop = FALSE], Usum) -} - -#' RPCA Table Filtering and Preprocessing -#' -#' Internal function that performs filtering and cleanup on a compositional data table -#' prior to Robust PCA analysis. Removes low-count features/samples, enforces non-zero frequency thresholds, -#' checks for ID duplication, and returns a matrix suitable for transformation and ordination. -#' -#' @param table A matrix or data frame with features as rows and samples as columns. -#' @param min.sample.count Minimum total count required for a sample to be retained. Default is 0. -#' @param min.feature.count Minimum total count required for a feature to be retained. Default is 0. -#' @param min.feature.frequency Minimum percentage (0–100) of samples in which a feature must be non-zero. Default is 0. -#' -#' @return A filtered numeric matrix containing non-empty features and samples. 
-#' @keywords internal -#' @noRd - -.rpca_table_processing <- function(table, - min.sample.count = 0, - min.feature.count = 0, - min.feature.frequency = 0) { - # Ensure the input is a matrix - if (is.data.frame(table)) { - table <- as.matrix(table) - } - - n.features <- nrow(table) - n.samples <- ncol(table) - - # Filter features by total count - if (!is.null(min.feature.count)) { - feature.totals <- rowSums(table, na.rm = TRUE) - keep.features <- feature.totals > min.feature.count - table <- table[keep.features, , drop = FALSE] - } - - # Filter features by frequency across samples - if (!is.null(min.feature.frequency)) { - freq.threshold <- min.feature.frequency / 100 - feature.freq <- rowMeans(table > 0, na.rm = TRUE) - keep.features <- feature.freq > freq.threshold - table <- table[keep.features, , drop = FALSE] - } - - # Filter samples by total count - if (!is.null(min.sample.count)) { - sample.totals <- colSums(table, na.rm = TRUE) - keep.samples <- sample.totals > min.sample.count - table <- table[, keep.samples, drop = FALSE] - } - - # Check for duplicate IDs - if (any(duplicated(colnames(table)))) { - stop("Data table contains duplicate sample (column) IDs.", call. = FALSE) - } - if (any(duplicated(rownames(table)))) { - stop("Data table contains duplicate feature (row) IDs.", call. = FALSE) - } - - # Remove empty rows and columns if sample filtering applied - if (!is.null(min.sample.count)) { - nonzero.features <- rowSums(table, na.rm = TRUE) > 0 - nonzero.samples <- colSums(table, na.rm = TRUE) > 0 - table <- table[nonzero.features, nonzero.samples, drop = FALSE] - } - - return(table) -} - -#' Generate a MaskedMatrix from Numeric Input -#' -#' Internal helper that ensures input is a 2D numeric matrix and returns a masked version, -#' replacing non-finite values (e.g., NA, NaN, Inf) with `NA` and recording their positions in a logical mask. -#' -#' @param mat A numeric matrix or vector. If a vector, it will be converted to a 1-row matrix. 
-#' -#' @return A list of class \code{"MaskedMatrix"} with two elements: -#' \describe{ -#' \item{data}{The original matrix with non-finite values replaced by \code{NA}.} -#' \item{mask}{Logical matrix indicating non-finite entries (TRUE if missing).} -#' } -#' -#' @keywords internal -#' @noRd - -.mask_value_only <- function(mat) { - # Ensure matrix is at least 2D - if (is.vector(mat)) { - mat <- matrix(mat, nrow = 1) - } - - # Ensure matrix is not more than 2D - if (length(dim(mat)) > 2) { - stop("Input matrix can only have two dimensions or less") - } - - # Generate logical mask: TRUE where values are missing - mask <- !is.finite(mat) - - # Create masked matrix - masked.mat <- mat - masked.mat[!is.finite(mat)] <- NA - - # Return as a masked matrix - return(structure(list( - data = masked.mat, - mask = mask - ), class = "MaskedMatrix")) -} - -#' Internal constructor for ordination results -#' @keywords internal -#' @noRd -.ordination_results <- function(method, eigvals, samples, features, - proportion.explained, dist = NULL, metadata = list()) { - return(structure(list( - method = method, - eigvals = eigvals, - samples = samples, - features = features, - proportion.explained = proportion.explained, - dist = dist, - metadata = metadata - ), class = "OrdinationResults")) -} - -#' Internal constructor for a distance matrix object -#' @keywords internal -#' @noRd -.distance_matrix <- function(matrix, ids = NULL, method = "euclidean") { - if (!is.matrix(matrix)) stop("Input must be a matrix.") - if (!isSymmetric(matrix)) stop("Distance matrix must be symmetric.") - if (!is.null(ids)) { - if (length(ids) != nrow(matrix)) stop("Length of 'ids' must match matrix dimensions.") - rownames(matrix) <- ids - colnames(matrix) <- ids - } - return(structure(list( - data = matrix, - ids = rownames(matrix), - method = method - ), class = "DistanceMatrix")) -} - -#' Extract per-experiment assay tables from a MultiAssayExperiment -#' -#' @param x A MultiAssayExperiment. 
-#' @param experiments Character vector of experiment names (or NULL for all). -#' -#' @return A list with `tables`, `experiments`, and `assay_names_used`. -#' -#' @keywords internal -#' @noRd -.extract_mae_tables <- function(x, experiments = NULL) { - exps <- experiments(x) - - if (is.null(experiments)) { - experiments <- names(exps) - } - if (length(experiments) == 0L) { - stop("No experiments found in 'x'.", call. = FALSE) - } - - assay_names_used <- setNames(character(length(experiments)), experiments) - tables <- vector("list", length(experiments)) - names(tables) <- experiments - - for (i in seq_along(experiments)) { - e <- experiments[[i]] - exp_se <- exps[[e]] - if (is.null(exp_se)) { - stop(sprintf("Experiment '%s' not found in 'x'.", e), call. = FALSE) - } - - anm <- assayNames(exp_se) - default_assay <- if (length(anm)) anm[[1]] else 1L - - assay_names_used[[e]] <- if (is.character(default_assay)) default_assay else as.character(default_assay) - tables[[e]] <- assay(exp_se, default_assay) - } - - list( - tables = tables, - experiments = experiments, - assay_names_used = assay_names_used - ) -} - -#' OptSpace back-end (joint & single-view) -#' -#' OptSpace-based solvers built on top of \code{vegan::optspace()}. -#' -#' This file contains: -#' \itemize{ -#' \item \code{.optspace_helper()} — single-view rCLR → OptSpace → biplot -#' \item \code{.joint_optspace_solve()} — multi-view joint factorization -#' } -#' -#' @keywords internal -#' @noRd -NULL - -#' OptSpace-Based Dimensionality Reduction and RPCA Biplot Generation -#' -#' Internal function that fits an OptSpace model to a rCLR-transformed compositional table, -#' reconstructs the low-rank matrix, applies PCA, and constructs an ordination result -#' capturing sample embeddings, feature loadings, and explained variance. A distance matrix -#' is also generated using Aitchison geometry. -#' -#' @param rclr.table A numeric matrix representing rCLR-transformed compositional data. 
-#' @param feature.ids Character vector of feature names (used for row labeling of loadings). -#' @param sample.ids Character vector of sample names (used for row labeling of embeddings). -#' @param n.components Integer specifying number of principal components to retain. Default is 3. -#' @param max.iterations Maximum number of iterations to run OptSpace optimization. Default is 5. -#' -#' @return A list with: -#' \describe{ -#' \item{ord_res}{An \code{OrdinationResults} object containing PCA scores, loadings, and metadata.} -#' \item{dist}{A sample-by-sample \code{DistanceMatrix} object using Aitchison geometry.} -#' \item{opt_fit}{The raw OptSpace fit result containing matrices \code{X}, \code{Y}, and \code{S}.} -#' } -#' -#' @keywords internal -#' @noRd - -.optspace_helper <- function(rclr.table, - feature.ids, - sample.ids, - n.components = 3, - max.iterations = 5, - tol = 1e-5, - center = TRUE, - scale = FALSE) { - opt.result <- vegan::optspace( - x = rclr.table, - ropt = n.components, - niter = max.iterations, - tol = tol, - verbose = FALSE - ) - - n.components <- ncol(opt.result$S) - - # Reconstruct and re-center matrix - X.hat <- opt.result$X %*% opt.result$S %*% t(opt.result$Y) - X.hat <- scale(X.hat, center = center, scale = scale) - X.hat <- t(scale(t(X.hat), center = center, scale = scale)) - - # PCA - svd.out <- svd(X.hat) - u <- svd.out$u[, 1:n.components, drop = FALSE] - s <- svd.out$d[1:n.components] - v <- svd.out$v[, 1:n.components, drop = FALSE] - - # Label loadings - rename.cols <- paste0("PC", seq_len(n.components)) - sample.scores <- u - feature.scores <- data.frame(v, row.names = feature.ids) - feature.scores <- as.matrix(feature.scores) - rownames(sample.scores) <- sample.ids - colnames(sample.scores) <- rename.cols - colnames(feature.scores) <- rename.cols - - # Proportion explained - prop.var <- s^2 / sum(svd.out$d^2) - names(prop.var) <- rename.cols - names(s) <- rename.cols - - # Add PC3 for 2D case - if (n.components == 2) { - 
sample.scores <- cbind(sample.scores, PC3 = 0) - feature.scores <- cbind(feature.scores, PC3 = 0) - sample.scores <- as.matrix(sample.scores) - feature.scores <- as.matrix(feature.scores) - - s <- c(s, PC3 = 0) - prop.var <- c(prop.var, PC3 = 0) - rename.cols <- c(rename.cols, "PC3") - } - - # Compute distance in sample PC-space - dist.matrix.raw <- as.matrix(dist(u)) - rownames(dist.matrix.raw) <- sample.ids - colnames(dist.matrix.raw) <- sample.ids - - dist.res <- .distance_matrix(dist.matrix.raw, - ids = sample.ids, - method = "aitchison" - ) - - ord_res <- .ordination_results( - method = "rpca_biplot", - eigvals = s, - samples = sample.scores, - features = feature.scores, - proportion.explained = prop.var, - dist = dist.matrix.raw, - metadata = list( - long.method.name = "(Robust Aitchison) RPCA Biplot", - run.id = sprintf( - "optspace_helper_n.components_%d.max.iterations_%d", - n.components, max.iterations - ) - ) - ) - - return(list( - ord_res = ord_res, - dist = dist.res, - opt_fit = opt.result - )) -} - -#' Joint OptSpace Optimization Across Multiple Train/Test Splits -#' -#' Internal function that performs joint matrix factorization using OptSpace across a set of paired train/test compositional tables. -#' Stacks training matrices horizontally, applies low-rank optimization, splits feature loadings per table, -#' and evaluates projection error on test data via Frobenius norm. -#' -#' @param train.test.pairs A list of paired matrices where each element is a two-item list: \code{[[test, train]]}. -#' @param n.components Integer specifying number of components to retain in the OptSpace model. -#' @param max.iter Maximum number of optimization iterations. Default is 50. -#' @param verbose Logical; whether to print progress messages. Default is \code{TRUE}. 
-#' -#' @return A list with: -#' \describe{ -#' \item{U}{Shared sample embedding matrix across all input tables.} -#' \item{S}{Singular values matrix from OptSpace decomposition.} -#' \item{V_list}{List of per-table feature loading matrices.} -#' \item{dists}{Matrix of reconstruction errors (rows: error type, columns: tables).} -#' } -#' -#' @keywords internal -#' @noRd - -.joint_optspace_solve <- function(train.test.pairs, n.components, - max.iter = 50, verbose = TRUE, - tol = 1e-5) { - # Prepare lists to hold training matrices and dimensions - train.matrices <- list() - test.matrices <- list() - dims <- list() - - for (pair in train.test.pairs) { - test.mat <- pair[[1]] - train.mat <- pair[[2]] - train.matrices <- append(train.matrices, list(train.mat)) - test.matrices <- append(test.matrices, list(test.mat)) - dims <- append(dims, list(dim(train.mat))) - } - - # Stack training matrices horizontally - train.stacked <- do.call(cbind, train.matrices) - - # Apply OptSpace to stacked matrix via vegan - if (verbose) { - message("Running vegan::optspace() on stacked training data...") - } - - fit <- vegan::optspace( - x = train.stacked, - ropt = n.components, - niter = max.iter, - tol = tol, - verbose = verbose - ) - - # Extract sample loadings - U.shared <- fit$X - S.shared <- fit$S - - # Split V back into per-table pieces - feat.indices <- cumsum(vapply(dims, function(d) d[2], numeric(1))) - feat.starts <- c(1, head(feat.indices, -1) + 1) - V_list <- Map( - function(start, end) fit$Y[start:end, , drop = FALSE], - feat.starts, feat.indices - ) - - # Reconstruction error per view (mean_CV / std_CV placeholder) - n_views <- length(test.matrices) - dists <- matrix(0, nrow = 2, ncol = n_views) - - for (i in seq_along(test.matrices)) { - V.k <- V_list[[i]] - test.mat <- test.matrices[[i]] - - # Project test samples: U.test = test × V - U.test <- as.matrix(test.mat) %*% V.k - U.test <- sweep(U.test, 2, diag(S.shared), "/") - recon.test <- U.test %*% S.shared %*% t(V.k) - - 
# Center for consistency - recon.test <- scale(recon.test, center = TRUE, scale = FALSE) - recon.test <- t(scale(t(recon.test), center = TRUE, scale = FALSE)) - - error <- test.mat - recon.test - error[is.na(error)] <- 0 - error.val <- norm(error, "F") / sqrt(sum(!is.na(test.mat))) - - dists[1, i] <- error.val - dists[2, i] <- 0 - } - - return(list(U = U.shared, S = S.shared, V_list = V_list, dists = dists)) -} diff --git a/R/getRPCA.R b/R/getRPCA.R new file mode 100644 index 000000000..e608e97c2 --- /dev/null +++ b/R/getRPCA.R @@ -0,0 +1,907 @@ +#' @name +#' getRPCA +#' +#' @title +#' Run (joint) robust principal component analysis (RPCA) +#' +#' @description +#' These functions implement robust principal component analysis (RPCA) +#' for single tables and joint RPCA for multiple tables. +#' \code{*RPCA} functions run RPCA for single table. \code{*JointRPCA} runs the +#' analysis jointly for multiple tables. \code{get*} functions return the +#' results of the analysis while \code{add*} adds them to the input object. +#' +#' @details +#' These functions perform robust principal component analysis (RPCA) using a +#' low-rank matrix approximation followed by principal component analysis. +#' Missing values are handled using matrix completion based on the OptSpace +#' algorithm. +#' +#' \strong{Single-table RPCA} +#' +#' For a single assay, the workflow is: +#' \enumerate{ +#' \item Extract the selected assay matrix +#' \item Estimate a low-rank approximation of the matrix using the OptSpace +#' algorithm, which reconstructs missing values and reduces noise. +#' \item Apply double centering (row and column centering). +#' \item Apply PCA (via singular value decomposition) to the centered matrix. +#' \item Return the sample coordinates in PCA space together with additional +#' information such as loadings, explained variance, and pairwise distances. 
+#' }
+#'
+#' \strong{Joint RPCA}
+#'
+#' When multiple assays are provided, joint RPCA estimates a shared low-rank
+#' representation across all tables. Each table may contain different features,
+#' but they must share the same samples.
+#'
+#' The workflow is:
+#' \enumerate{
+#'   \item Extract the selected experiments and assays.
+#'   \item Estimate a joint low-rank representation using a joint OptSpace
+#'   algorithm that learns shared sample factors while allowing
+#'   table-specific feature loadings.
+#'   \item Apply double centering (row and column centering).
+#'   \item Perform PCA on the centered matrix to obtain the final
+#'   sample coordinates.
+#'   \item Return the sample coordinates in PCA space together with additional
+#'   information such as loadings, explained variance, and pairwise distances.
+#' }
+#'
+#' To assess generalization, the joint RPCA procedure optionally splits the
+#' samples into training and test sets. The low-rank model is learned using the
+#' training data and the test samples are projected into the resulting PCA
+#' space. Reconstruction error for the test set is returned as a measure of
+#' model fit.
+#'
+#' The returned object contains the PCA sample scores as the main result,
+#' with additional information (e.g., rotation matrix, explained variance,
+#' reconstructed matrix, and reconstruction error) stored in attributes.
+#'
+#'
+#' @return
+#' \code{matrix}, \code{TreeSummarizedExperiment} or \code{MultiAssayExperiment}
+#' object.
+#'
+#' @inheritParams addAlpha
+#'
+#' @param assay.type \code{Character scalar}. Specifies the name of assay
+#' used in calculation. (Default: \code{"counts"})
+#'
+#' @param experiments \code{Character vector} or \code{integer scalar}. Names
+#' or indices of experiments selected from \code{x}.
+#'
+#' @param assay.types \code{Character vector}. Names of assays selected from
+#' \code{experiments}.
+#'
+#'@param name \code{Character scalar}. Name of results stored in \code{x}.
+#'(Default: \code{"RPCA"} or \code{"JointRPCA"} depending on the method)
+#'
+#' @param ... additional arguments:
+#' \itemize{
+#'   \item \code{ncomponents}:\code{Integer scalar}. The number of components
+#'   to estimate. (Default: \code{3})
+#'   \item \code{max.iterations}:\code{Integer scalar}. The number of iterations
+#'   run in OptSpace algorithm. (Default: \code{5})
+#'   \item \code{tolerance}:\code{Numeric scalar}. Accepted error between
+#'   lower rank representation obtained by OptSpace algorithm and original
+#'   table. (Default: \code{1e-5})
+#' }
+#'
+#' @examples
+#' data("ibdmdb")
+#' mae <- ibdmdb
+#'
+#' # Apply data transformations
+#' mae[[1]] <- transformAssay(mae[[1]], assay.type = "mgx", method = "rclr")
+#' mae[[2]] <- transformAssay(mae[[2]], assay.type = "mtx", method = "rclr")
+#'
+#' # Run joint-RPCA
+#' res <- getJointRPCA(
+#'     mae,
+#'     experiments = c(1, 2),
+#'     assay.types = c("rclr", "rclr")
+#' )
+#'
+#' # Run RPCA for single experiment
+#' res <- getRPCA(mae[[1]], assay.type = "rclr")
+#'
+#' @seealso
+#' \code{\link[scater:runPCA]{scater::runPCA}}
+#'
+#' @references
+#'
+#' Martino, C. and Shenhav, L. et al. (2020)
+#' Context-aware dimensionality reduction deconvolutes gut microbial community
+#' dynamics.
+#' _Nat. Biotechnol._ doi:10.1038/s41587-020-0660-7
+#'
+NULL
+
+#' @rdname getRPCA
+#' @export
+setMethod("getRPCA", signature = c(x = "SingleCellExperiment"),
+    function(x, ...){
+        x <- .check_and_get_altExp(x, ...)
+        res <- callNextMethod(x, ...)
+        return(res)
+    }
+)
+
+#' @rdname getRPCA
+#' @export
+setMethod("getRPCA", signature = c(x = "SummarizedExperiment"),
+    function(x, assay.type = "counts", ...){
+        .check_assay_present(assay.type, x)
+        mat <- assay(x, assay.type) |> t()
+        res <- .calculate_rpca(mat, ...)
+ return(res) + } +) + +#' @rdname getRPCA +#' @export +setMethod("addRPCA", signature = c(x = "SummarizedExperiment"), + function(x, name = "RPCA", ...){ + if( !.is_a_string(name) ){ + stop("'name' must be a single character value.", call. = FALSE) + } + # Hiddenly support altExp + x <- .check_and_get_altExp(x, ...) + # Calculate indices + args <- c(list(x = x), list(...)) + args <- args[ !names(args) %in% c("altexp") ] + res <- do.call(getRPCA, args) + # Add object to reducedDim + x <- .add_object_to_reduceddim(x, res, name = name, ...) + return(x) + } +) + +#' @rdname getRPCA + +#' @export +setMethod("getJointRPCA", signature = c(x = "MultiAssayExperiment"), + function(x, experiments, assay.types, ...){ + if( !(length(experiments) == length(assay.types) && + length(experiments) > 1L) ){ + stop("The lengths of 'experiments' and 'assay.types' must match ", + "and there must be multiple experiments selected.", + call. = FALSE) + } + mat_list <- .prepare_mae_for_joint_rpca(x, experiments, assay.types) + res <- .run_joint_rpca_analysis(mat_list, ...) + return(res) + } +) + +#' @rdname getRPCA +#' @export +setMethod("getJointRPCA", signature = c(x = "SingleCellExperiment"), + function(x, experiments, assay.types, ...){ + if( !(length(experiments) == length(assay.types) && + length(experiments) > 1L) ){ + stop("The lengths of 'experiments' and 'assay.types' must match ", + "and there must be multiple experiments selected.", + call. = FALSE) + } + mat_list <- .prepare_tse_for_joint_rpca(x, experiments, assay.types) + res <- .run_joint_rpca_analysis(mat_list, ...) + return(res) + } +) + +#' @rdname getRPCA +#' @export +setMethod("addJointRPCA", signature = c(x = "SingleCellExperiment"), + function(x, name = "JointRPCA", ...){ + if( !.is_a_string(name) ){ + stop("'name' must be a single character value.", call. = FALSE) + } + res <- getJointRPCA(x, ...) + x <- .add_object_to_reduceddim(x, res, name = name, ...) 
+        return(x)
+    }
+)
+
+#' @rdname getRPCA
+#' @export
+setMethod("addJointRPCA", signature = c(x = "MultiAssayExperiment"),
+    function(x, name = "JointRPCA", ...){
+        if( !.is_a_string(name) ){
+            stop("'name' must be a single character value.", call. = FALSE)
+        }
+        res <- getJointRPCA(x, ...)
+        x <- .add_values_to_mae_metadata(x, res, names = name, ...)
+        return(x)
+    }
+)
+
+################################ HELP FUNCTIONS ################################
+
+# This function retrieves specific tables from MAE
+#' @importFrom MultiAssayExperiment intersectColumns
+.prepare_mae_for_joint_rpca <- function(x, experiments, assay.types, ...){
+    # Select experiments from MAE
+    x <- .select_experiments(x, experiments)
+    # Select samples that are shared between experiments
+    x <- intersectColumns(x)
+    # Select alternative experiments
+    x <- .select_altexps(x, ...)
+    # Select assays of each experiment
+    x <- .select_assays_from_mae(x, assay.types)
+    # Get tables as a list
+    mat_list <- MultiAssayExperiment::assays(x)
+    # Change orientation so that samples are in rows
+    mat_list <- lapply(mat_list, t)
+    return(mat_list)
+}
+
+# This function retrieves specific tables from TreeSE
+.prepare_tse_for_joint_rpca <- function(x, experiments, assay.types){
+    # Select experiments from TreeSE
+    tse_list <- .select_experiments_from_tse(x, experiments)
+    # Select assays of each experiment
+    tse_list <- .select_assays_from_tse_list(tse_list, assay.types)
+    # Get tables as a list
+    mat_list <- lapply(tse_list, assay)
+    # Change orientation so that samples are in rows
+    mat_list <- lapply(mat_list, t)
+    return(mat_list)
+}
+
+# This function calculates Joint-RPCA for multiple tables. It runs the whole
+# analysis from test/train set split to projecting the results to test set.
+.run_joint_rpca_analysis <- function(mat_list, test.set = NULL, ...){
+    if( !(is.null(test.set) || is.character(test.set)) ){
+        stop("'test.set' must be NULL or a character vector of sample names.",
+            call. = FALSE)
+    }
+    # Determine train/test split.
User can define test set samples with vector + # or then we can select representative samples based on RPCA of first table. + all_samples <- mat_list[[1L]] |> rownames() + if( !is.null(test.set) ){ + test_samples <- which( all_samples %in% test.set ) + } else{ + test_samples <- .determine_test_set_for_rpca(mat_list[[1L]], ...) + } + all_index <- all_samples |> length() |> seq_len() + train_samples <- all_index[ !all_index %in% test_samples ] + + # Split tables to train and test sets + train_set <- lapply(mat_list, function(x){ + x[train_samples, , drop = FALSE] + }) + test_set <- lapply(mat_list, function(x){ + x[test_samples, , drop = FALSE] + }) + # If user did not specify test set, use training set as test set + if( all(lengths(test_set) == 0L) ){ + test_set <- train_set + } + + # Calculate RPCA + res <- .calculate_joint_rpca(train_set, test_set, ...) + # Project test samples to PCA space determined by train set + test_mat <- do.call(cbind, test_set) + projected <- .project_test_set_to_rpca(res, test_mat) + # Add test samples to pca results + attr_list <- attributes(res) + res <- rbind(res, projected) + # Sort back to original order + res <- res[ + order(c(train_samples, test_samples)), , drop = FALSE] + # Add additional info back + attr_list <- c(attributes(res), attr_list) + attr_list <- attr_list[ !duplicated(names(attr_list)) ] + attributes(res) <- attr_list + + # Calculate error between low rank and original test set + num_features <- vapply(mat_list, ncol, numeric(1L)) + reconstruct_error <- .calculate_reconstruct_error( + res, test_set, num_features) + attributes(res)[["reconstruct_error"]] <- reconstruct_error + + return(res) +} + +# This function runs RPCA to single table +#' @importFrom stats dist +.calculate_rpca <- function(mat, ncomponents = 3L, ...){ + # Get lower rank representation of the data + opt_results <- .get_lower_rank_mat(mat, ncomponents = ncomponents, ...) 
+ # The result might have lower number of columns if they were not able to be + # estimated. + ncomponents <- opt_results[["raw"]][["S"]] |> ncol() + # Apply pca to lower rank representation + pca_results <- .calculate_pca( + opt_results[["matrix"]], ncomponents = ncomponents) + # Calculate distance in PCA space + distance <- pca_results[["sample_scores"]] |> dist() + # Create a final results to return to user + res <- .construct_rpca_result(pca_results, opt_results, distance) + return(res) +} + +# This function runs Joint-RPCA. The only difference to .calculate_rpca is that +# the lower dimension matrix is estimated by optimizing feature loadings +# separately (sample loadings and singular values are estimated jointly). +#' @importFrom stats dist +.calculate_joint_rpca <- function(mat_list, test_set, ncomponents = 3L, ...){ + # Get lower rank representation of the data + opt_results <- .get_lower_rank_joint_mat( + mat_list, test_set, ncomponents = ncomponents, ...) + # The result might have lower number of columns if they were not able to be + # estimated. + ncomponents <- opt_results[["raw"]][["S"]] |> ncol() + # Apply pca to lower rank representation + pca_results <- .calculate_pca( + opt_results[["matrix"]], ncomponents = ncomponents) + # Calculate distance in PCA space + distance <- pca_results[["sample_scores"]] |> dist() + # Create a final results to return to user + pca_result <- .construct_rpca_result(pca_results, opt_results, distance) + return(pca_result) +} + +# This function constructs a lower rank representation from the data. The idea +# is to extract the essential from the data and to remove noise. 
+#' @importFrom vegan optspace +.get_lower_rank_mat <- function( + mat, + ncomponents = pmin(3L, nrow(mat), ncol(mat)), + max.iterations = 5L, + tolerance = 1e-5, + ...){ + # Create lower rank representation + opt_result <- optspace( + x = mat, + ropt = ncomponents, + niter = max.iterations, + tol = tolerance, + verbose = FALSE + ) + + # Reconstruct the matrix + X_hat <- opt_result[["X"]] %*% opt_result[["S"]] %*% t(opt_result[["Y"]]) + # Add old feature and sample names as they are dropped off + dimnames(X_hat) <- dimnames(mat) + + # Create a result list + res <- list( + matrix = X_hat, + raw = opt_result + ) + + return(res) +} + +# Construct lower rank representation from a list of matrices. +.get_lower_rank_joint_mat <- function( + mat_list, mat_list_test, + ncomponents = pmin(3L, nrow(mat_list[[1L]]), ncol(mat_list[[1L]])), + max.iterations = 5L, + ...){ + # Create lower rank representation + opt_result <- .joint_optspace( + x = mat_list, + x_test = mat_list_test, + ropt = ncomponents, + niter = max.iterations + ) + + # Reconstruct the matrix + X_hat <- opt_result[["X"]] %*% opt_result[["S"]] %*% t(opt_result[["Y"]]) + # Add old feature and sample names as they are dropped off + nams <- do.call(cbind, mat_list) |> dimnames() + dimnames(X_hat) <- nams + + # Create a result list + res <- list( + matrix = X_hat, + raw = opt_result + ) + + return(res) +} + +# This function applies PCA to the data. +.calculate_pca <- function(mat, ncomponents){ + # Double center the data + mat <- .apply_double_centering(mat) + + # Run PCA + svd_result <- mat |> svd() + u <- svd_result[["u"]] + s <- svd_result[["d"]] + v <- svd_result[["v"]] + + # We multiply U by singular values so that the sample coordinates reflect + # actual variance magnitude rather than just orthonormal directions. + # u <- u %*% diag(s) + + # Subset. There might be more components than requested. 
+ u <- u[ , seq_len(ncomponents), drop = FALSE] + s <- s[ seq_len(ncomponents) ] + v <- v[ , seq_len(ncomponents), drop = FALSE] + + # Adjust dimnames + names(s) <- paste0("PC", s |> length() |> seq_len()) + dimnames(u) <- list(rownames(mat), names(s)) + dimnames(v) <- list(colnames(mat), names(s)) + + # Create a result list + pca_results <- list( + sample_scores = u, + varExplained = s, + rotation = v, + center = attributes(mat)[["center"]] + ) + + return(pca_results) +} + +# This function constructs a final result to user. It does not calculate, but +# re-structures the results to returned format. +.construct_rpca_result <- function(pca_results, opt_results, distance){ + # We return the PCA sample scores as main results + mat <- pca_results[["sample_scores"]] + attr_list <- pca_results[ c("varExplained", "rotation", "center", "scale") ] + + # Calculate the explained variance in percentages + percent_var <- pca_results[["varExplained"]]^2 / + sum(pca_results[["varExplained"]]^2) * 100 + attr_list[["percentVar"]] <- percent_var + + # Add the lower rank representation and distance to result list + attr_list[["lower_dim"]] <- opt_results[["matrix"]] + attr_list <- c(attr_list, opt_results[["raw"]]) + attr_list[["distance"]] <- distance + + # Add additional results to main result matrix + attr_list <- c(attributes(mat), attr_list) + attributes(mat) <- attr_list + + return(mat) +} + +# This function selects a representative set of samples to test set. This is +# done by applying RPCA for the first table and selecting samples from PC1 with +# a highest variance. +.determine_test_set_for_rpca <- function( + mat, test.ratio = 0.2, ...){ + # Select number of samples + test_n <- ceiling(test.ratio * nrow(mat)) + test_samples <- c() + if( test_n > 0L ){ + # Calculate RPCA + pca_result <- .calculate_rpca(mat, ...) + # Select N samples so that they span over the PC1 axis. 
Idea is that + # these samples should represent the dataset the best as there are + # maximally different samples. + first_component <- pca_result[, 1] |> sort() + test_samples <- seq(1, length(first_component), length.out = test_n) |> + round() + test_samples <- names(first_component)[ test_samples ] + test_samples <- match(test_samples, rownames(pca_result)) + } + return(test_samples) +} + +# This function projects test set samples to PCA space that were obtained with +# train set. +.project_test_set_to_rpca <- function(pca_result, mat){ + # Extract PCA components + feature_scores <- attributes(pca_result)[["rotation"]] + singular_values <- attributes(pca_result)[["varExplained"]] + + center <- attributes(pca_result)[["center"]] + # Row centering (new samples) + mat <- sweep(mat, 1L, rowMeans(mat), "-") + # Column centering (training means) + # mat <- sweep(mat, 2L, center[["col"]], "-") + mat <- sweep(mat, 2L, colMeans(mat), "-") + # Add grand mean to avoid subtracting the mean twice (training means) + # mat <- mat + center[["grand"]] + + # Project into PCA space + projected <- mat %*% feature_scores + + # Normalize based on singular values + projected <- projected / sqrt(sum(singular_values^2)) + + return(projected) +} + +# This function calculates error between raw test set values and their +# lower rank representation. The lower rank representation is calculated based +# on parameters learned from train set. The idea is to assess, how well the +# lower rank representation learns the generic, generalizable patterns from the +# data. 
+.calculate_reconstruct_error <- function(res, test_set, num_features){ + # Get learned parameters + u_shared <- attributes(res)[["X"]] + s_shared <- attributes(res)[["S"]] + y_shared <- attributes(res)[["Y"]] + + # Split feature loadings by table + ends <- cumsum(num_features) + starts <- c(1, head(ends, -1) + 1) + y_individual <- mapply(function(s, e) { + y_shared[s:e, ] + }, starts, ends) + + # Calculate error separately for each table + errors_per_set <- vapply(seq_len(length(test_set)), function(i){ + # Calculate lower rank representation + test_mat <- test_set[[i]] + u_test <- test_mat %*% y_individual[[i]] + u_test <- sweep(u_test, 2, diag(s_shared), "/") + recon_test <- u_test %*% s_shared %*% t(y_individual[[i]]) + # Calculate error between actual values and lower rank representation + error <- test_mat - recon_test + error[is.na(error)] <- 0 + error <- norm(error, "F") / sqrt(sum(!is.na(test_mat))) + return(error) + }, numeric(1L)) + names(errors_per_set) <- names(num_features) + + return(errors_per_set) +} + +# ----------------------------------------------------------------------------- +# .joint_optspace +# +# Run the *joint OptSpace* matrix completion algorithm on multiple tables that +# share the same rows (samples) but may contain different feature sets. +# +# Goal +# ---- +# Estimate a shared low-rank representation across several matrices with +# missing values. Each matrix shares the same sample space (rows) but has +# its own features (columns). The algorithm learns: +# +# X (U_shared) : shared sample scores +# S (S_shared) : singular values (component scaling) +# Y (V_individual) : feature loadings for each matrix (table-specific) +# +# Model approximation: +# +# M_i ≈ U_shared %*% S_shared %*% t(V_i) +# +# where M_i is table i and V_i are feature loadings specific to that table. 
+# +# References used for implementation guidance: +# - gemelli implementation: https://github.com/biocore/gemelli/blob/00e3993f7358006d99a481ad281ee5801c6e159e/gemelli/optspace.py#L200 +# - vegan::optspace: https://github.com/vegandevs/vegan/blob/5f992e14f1d644d1734195a30042f18028d090b1/R/optspace.R#L12 +# +# Returns +# ------- +# list( +# X = sample scores (U_shared) +# S = singular value matrix +# Y = stacked feature loadings +# cv_error = cross-validation error +# ) +# ----------------------------------------------------------------------------- +# +.joint_optspace <- function(x, x_test = x, ropt = 3, niter = 5){ + # Validate input + if( !(is.list(x) && all(vapply( + x, function(mat) is.matrix(mat) || is.data.frame(mat), + logical(1L))) ) ){ + stop("'x' must be a list of matrices.", call. = FALSE) + } + if( vapply(x, nrow, integer(1L)) |> unique() |> length() != 1L ){ + stop("All tables must have equal number of samples (rows).", + call. = FALSE) + } + if( do.call(cbind, x) |> is.infinite() |> any() ){ + stop("Infinite values are not allowed.", call. = FALSE) + } + if( !.is_an_integer(niter) ){ + stop("'niter' must be a single integer value.", call. = FALSE) + } + if( !.is_an_integer(ropt) ){ + stop("'ropt' must be a single integer value.", call. = FALSE) + } + # + # Input tables can be matrices also, so make sure that they are now + # matrices. + x <- lapply(x, as.matrix) + x_test <- lapply(x_test, as.matrix) + # Number of input tables + n_tables <- x |> length() + # Number of samples (rows). Guaranteed equal across tables. + n_samples <- x[[1]] |> nrow() + # Number of features per table + n_features <- vapply(x, ncol, integer(1L)) + # Total number of features after stacking tables + total_n_features <- n_features |> sum() + # Smallest feature count among tables + # Used for validating maximum rank. 
+ min_feat <- n_features |> min() + # ropt cannot exceed number of samples or smallest feature dimension + if( (ropt < 1) || (ropt > min_feat) || (ropt > n_samples) ){ + stop("'ropt' must be integer in [1, ", + min(n_samples, min_feat), + "]", call. = FALSE) + } + + # Prepare tables for optspace. In first table convert all NA values to 0. + observed_list <- lapply(x, function(mat){ + mat[ is.na(mat) ] <- 0 + return(mat) + }) + # The second table shows which cells included a value (were not NA). + mask_list <- lapply(x, function(mat){ + mask <- !is.na(mat) + storage.mode(mask) <- "integer" + return(mask) + }) + + # Stack all tables column-wise into a single matrix. We will use these + # tables in initialization and calculating + observed_stacked <- do.call(cbind, observed_list) + mask_stacked <- do.call(cbind, mask_list) + + # eps is used to rescale the observed matrix to account for missing entries. + total_non_zeroes <- mask_stacked |> sum() + eps <- total_non_zeroes / sqrt(total_n_features * n_samples) + # rho is gradient scaling factor. Helps stabilize gradient descent when + # matrices are sparse. + rho <- eps * n_samples + + # When many entries are missing, the magnitude of observed entries tends + # to underestimate the magnitude of the full matrix. We compensate + # for this by rescaling the observed matrix before optimization. + # + # The scale factor is reversed at the end of the algorithm. + rescale_param <- sum(mask_stacked) * ropt + rescale_param <- sqrt(rescale_param / (norm(observed_stacked, "F")^2)) + observed_stacked <- rescale_param * observed_stacked + + # We make initial guess for U_shared, S_shared and V. SVD with stacked data + # gives sensible starting point for gradient descent. 
+ init_res <- .initialize_joint_optspace( + observed_stacked, observed_list, mask_stacked, mask_list, ropt, eps, + n_samples, n_features) + U_shared <- init_res[["U_shared"]] + S_shared <- init_res[["S_shared"]] + V_list <- init_res[["V_list"]] + + # Now we start iteratively refine U_shared, S_shared and V + cv_errors <- data.frame(mean = numeric(0L), sd = numeric(0L)) + for( i in seq_len(niter) ){ + sample_loadings <- vector("list", n_tables) + cv_iter <- c() + + # Iterate over tables + for( table_i in seq_len(n_tables) ){ + # Perform gradient update for current table + res <- .gradient_update_joint_optspace( + table_i, observed_list, mask_list, U_shared, + S_shared, V_list, rho) + # Store table-specific updates + # Proposal for shared U + sample_loadings[[table_i]] <- res[["U_i"]] + # Table specific, updated V + V_list[[table_i]] <- res[["V_i"]] + + # Calculate cross-validation error for test set + cv_error <- .calculate_optspace_cv_error( + x_test[[table_i]], + V = res[["V_i"]], + S = res[["S_i"]] + ) + cv_iter <- c(cv_iter, cv_error) + } + + # Combine CV error. We will create a table of errors where each row is + # single iteration. 
+ cv_iter <- data.frame(mean = mean(cv_iter), sd = sd(cv_iter)) + cv_errors <- rbind(cv_errors, cv_iter) + + # Update the shared sample factors (U_shared) + # - Each table proposes its own update of U (sample loadings) + # - Take the average across all tables to get a consensus + U_shared <- Reduce("+", sample_loadings) / n_tables + + # Update the shared singular values (S_shared) + # - First compute the average covariance of the updated U's + # - Perform SVD on X_U to extract the principal singular values + # - Normalize by Frobenius norm + X_U <- Reduce( + "+", lapply(sample_loadings, function(u) u %*% t(u))) / n_tables + svd_res <- svd(X_U) + S_shared <- svd_res[["d"]][seq_len(ropt)] |> diag() + S_shared <- S_shared / norm(S_shared, "F") + + # Align table-specific loadings with updated S_shared for consistent + # reconstruction + V_list <- lapply(V_list, function(V){ + t(S_shared %*% t(V)) + }) + } + + # Reverse the earlier rescaling so singular values match the scale + # of the original input data. + S_shared <- S_shared / rescale_param + + # Ensure components are ordered by decreasing singular value. + index_order <- order(diag(S_shared), decreasing = TRUE) + U_shared <- U_shared[, index_order, drop = FALSE] + S_shared <- S_shared[index_order, index_order, drop = FALSE] + # Combine feature loadings into one matrix + V_stacked <- do.call(rbind, V_list) + V_stacked <- V_stacked[, index_order, drop = FALSE] + + # Return U, S, and V with CV errors + res <- list( + X = U_shared, + S = S_shared, + Y = V_stacked, + cv_error = cv_errors + ) + return(res) +} + +# Initialize U, S and V by doing SVD with stacked data. +.initialize_joint_optspace <- function( + observed_stacked, observed_list, mask_stacked, mask_list, ropt, eps, + n_samples, n_features){ + # Run SVD. Our initial first guess are the loadings generated + # by the traditional SVD. 
+ svd_res <- svd(observed_stacked) + U_shared <- svd_res[["u"]][, seq_len(ropt), drop = FALSE] + U_shared <- U_shared[ + , U_shared |> ncol() |> seq_len() |> rev(), drop = FALSE] + S_shared <- svd_res[["d"]][ seq_len(ropt) ] |> rev() |> diag() + V_shared <- svd_res[["v"]][, seq_len(ropt), drop = FALSE] + V_shared <- V_shared[ + , V_shared |> ncol() |> seq_len() |> rev(), drop = FALSE] + + # The shape and number of non-zero values + # can set the input parameters for the gradient + # decent. + U_shared <- U_shared * sqrt(n_samples) + V_shared <- V_shared * sqrt(sum(n_features)) + S_shared <- S_shared / eps + + # Generate the new singular values from + # the initialization of U and V + S_shared <- .aux_getoptS( + U_shared, V_shared, observed_stacked, mask_stacked) + + # Split feature loadings by table + ends <- cumsum(n_features) + starts <- c(1, head(ends, -1) + 1) + V_list <- mapply(function(s, e) { + V_shared[seq(s, e), , drop = FALSE] + }, starts, ends) + + res <- list( + U_shared = U_shared, + S_shared = S_shared, + V_list = V_list + ) + return(res) +} + +# Perform one gradient descent update for a single table. For each table, this +# produces proposal for U_shared and S_shared which are then later averaged. +.gradient_update_joint_optspace <- function( + table_i, observed_list, mask_list, U_shared, S_shared, V_list, + rho, step.size = 1e4, sign.correction = -1){ + if( !(.is_an_integer(step.size) && step.size > 0) ){ + stop("'step.size' must be a single positive integer.", call. = FALSE) + } + if( !(.is_an_integer(sign.correction) && sign.correction %in% c(-1, 1)) ){ + stop("'sign.correction' must be either -1 or 1.", call. 
= FALSE) + } + obs <- observed_list[[table_i]] + mask <- mask_list[[table_i]] + V_i <- V_list[[table_i]] + + # Compute gradient for table i + grad_res <- .aux_gradF_t( + U_shared, + V_i, + S_shared, + obs, + mask, + step.size, + rho + ) + U_update <- grad_res[["W"]] + V_update <- grad_res[["Z"]] + + # Line search for optimal step + step <- .aux_getoptT( + U_shared, + U_update, + V_i, + V_update, + S_shared, + obs, + mask, + step.size, + rho + ) + + # Update table-specific matrices + U_i <- U_shared - sign.correction * step * U_update + V_i <- V_i - sign.correction * step * V_update + + # Recompute singular values for this table + S_i <- .aux_getoptS( + U_shared, + V_i, + obs, + mask + ) + + # Return updated parameters + res <- list( + U_i = U_i, + S_i = S_i, + V_i = V_i + + ) + return(res) +} + +# This function calculates error between original table and reconstructed table. +.calculate_optspace_cv_error <- function(x_test, S, V, ...){ + # Get mask, i.e., info on which values are NA + mask <- x_test |> is.na() + n_obs <- sum(!mask) + # For the test tables, we double center them. The idea is that CV focuses + # on latent structure reconstruction and not mean shifts. + x_test <- x_test |> .apply_double_centering() + # Treat missing entries as zero during multiplication + x_filled <- x_test + x_filled[mask] <- 0 + + # Project test data onto feature loadings + U_test <- x_filled %*% V + U_test <- sweep(U_test, 2, diag(S), "/") + + # Reconstruct test matrix from projected U and singular values + reconstruct_test <- U_test %*% S %*% t(V) + + # Keep NA structure like masked arrays + reconstruct_test[mask] <- NA + + # Compute error on observed entries + obs_error <- x_test - reconstruct_test + obs_error[is.na(obs_error)] <- 0 + + # Scale error by the number of observed entries + error <- norm(obs_error, "F") / sqrt(n_obs) + + return(error) +} + +# This function applies double centering of the data, i.e., it centers columns +# and rows. 
+.apply_double_centering <- function(mat, ...){ + grand_mean <- mean(mat, na.rm = TRUE) + row_means <- rowMeans(mat) + mat <- sweep(mat, 1L, row_means, "-") + col_means <- colMeans(mat) + mat <- sweep(mat, 2L, col_means, "-") + # Add overall mean so that we do not subtract the data effectively 2 times. + # The result is a matrix that has row and column means in zero. + # mat <- mat + grand_mean + + # Add centering and scaling parameters to attributes + attributes(mat) <- c(attributes(mat), list( + center = list( + row = row_means, + col = col_means, + grand = grand_mean + ) + )) + return(mat) +} diff --git a/R/mia.R b/R/mia.R index fc62a795b..e22905ad0 100644 --- a/R/mia.R +++ b/R/mia.R @@ -38,7 +38,7 @@ NULL #' and 3 samples} #' \item{\code{\link{GlobalPatterns}}: A TreeSummarizedExperiment with 19216 #' features and 26 samples} -#' \item{\code{\link{ibdmdb_2omic_demo}}: A compact +#' \item{\code{\link{ibdmdb}}: A compact #' \code{MultiAssayExperiment} demo dataset with 2 experiments (MGX + MTX)} #' \item{\code{\link{HintikkaXOData}}: A MultiAssayExperiment with 3 #' experiments (microbiota, metabolites and biomarkers)} @@ -394,7 +394,7 @@ NULL #' A compact example derived from the Integrative Human Microbiome Project (iHMP) #' Inflammatory Bowel Disease (IBD) cohort. #' -#' The dataset \code{ibdmdb_2omic_demo} is a named list with: +#' The dataset \code{ibdmdb} is a named list with: #' \itemize{ #' \item \code{se_mgx}: metagenomic taxonomic profiles (MGX) #' \item \code{se_mtx}: metatranscriptomic taxonomic profiles (MTX) @@ -404,12 +404,12 @@ NULL #' #' These compact objects are intended for quick examples and vignettes. #' Load with: -#' \code{data("ibdmdb_2omic_demo")}. +#' \code{data("ibdmdb")}. 
#' -#' @name ibdmdb_2omic_demo +#' @name ibdmdb #' @docType data #' @keywords datasets -#' @usage data("ibdmdb_2omic_demo") +#' @usage data("ibdmdb") #' #' @format A \code{MultiAssayExperiment} with two experiments: #' \describe{ @@ -430,4 +430,4 @@ NULL #' diseases.} #' Nature 569, 655–662 (2019). #' \doi{10.1038/s41586-019-1237-9} -NULL \ No newline at end of file +NULL diff --git a/R/utils.R b/R/utils.R index 51ced6acf..ecf0907c2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -18,6 +18,10 @@ .get_mat_from_sce <- scater:::.get_mat_from_sce .get_mat_for_reddim <- scater:::.get_mat_for_reddim +.aux_getoptS <- vegan:::.aux_getoptS +.aux_gradF_t <- vegan:::.aux_gradF_t +.aux_getoptT <- vegan:::.aux_getoptT +.aux_getoptS <- vegan:::.aux_getoptS ################################################################################ # integration with other packages @@ -350,6 +354,35 @@ return(x) } +.add_values_to_mae_metadata <- function( + x, names, values, metadata.name = "name", ...){ + # + if( !.is_a_string(metadata.name) ){ + stop("'metadata.name' must be a string.", call. = FALSE) + } + # + # Create a list and name elements + add_metadata <- list(values) + names(add_metadata) <- names + # Get old metadata + old_metadata <- metadata(x) + # Check if names match with elements that are already present + f <- names(old_metadata) %in% names(add_metadata) + if( any(f) ){ + warning( + "The following values are already present in `metadata` and will ", + "be overwritten: '", + paste(names(old_metadata)[f], collapse = "', '"), + "'. Consider using the '", metadata.name, + "' argument to specify alternative ", "names.", call. 
= FALSE) + } + # keep only unique values + add_metadata <- c( old_metadata[!f], add_metadata ) + # Add metadata to altExp or directly to x + metadata(x) <- add_metadata + return(x) +} + # This function can be used to add values to altExp .add_to_altExps <- function(x, values, name = names(values), ...){ # Check values @@ -711,4 +744,163 @@ # Add object to reducedDIm reducedDim(tse, name) <- res return(tse) -} \ No newline at end of file +} + +################################################################################ +# These following functions are for extracting more than one table from SCE or +# MAE. +# Select experiments from MAE +#' @importFrom MultiAssayExperiment experiments +.select_experiments <- function(mae, experiments) { + # Check that the value is correct + is_name <- is.character(experiments) && length(experiments) > 0 && + length(experiments) <= length(experiments(mae)) && + all(experiments %in% names(mae)) + is_index <- is.numeric(experiments) && all(experiments%%1==0) && + length(experiments) > 0 && + length(experiments) <= length(experiments(mae)) && + all(experiments>0 & experiments<=length(experiments(mae))) + if( !(is_name || is_index) ){ + stop("'experiments' must specify names or index of experiments of ", + "'x'", call. = FALSE) + } + # Subset experiments + mae <- mae[,, experiments] + # Check that all objects are SE + all_SE <- lapply(experiments(mae), function(x) + is(x, "SummarizedExperiment")) |> + unlist() |> all() + if( !all_SE ){ + stop("All experiments must be SummarizedExperiment objects.", + call. = FALSE) + } + return(mae) +} + +# Select optionally alternative experiments +#' @importFrom SingleCellExperiment altExps +.select_altexps <- function(mae, altexps = NULL, ...) 
{ + # Check that value is correct + is_name <- is.character(altexps) && length(altexps) > 0 && + length(altexps) <= length(experiments(mae)) + is_index <- is.numeric(altexps) && all(altexps%%1==0) && + length(altexps) > 0 && + length(altexps) <= length(experiments(mae)) + if( !( is.null(altexps) || is_name || is_index) ){ + stop("'altexps' must be NULL or specify alternative experiments for ", + "each experiment.", call. = FALSE) + } + # If specified, select altExps from experiments + if( !is.null(altexps) ){ + if( !require("SingleCellExperiment") ){ + stop("To enable 'altexps' option, 'SingleCellExperiment' package ", + "must be installed.", call. = FALSE) + } + names(altexps) <- names(mae) + for ( exp in names(mae) ){ + # Get altExp if it is not NA, which disables altExp for single + # experiment + if( !is.na(altexps[[exp]]) ){ + if( !is(mae[[exp]], "SingleCellExperiment") ){ + stop("Experiment '", exp, "' must be SingleCellExperiment ", + "object.", call. = FALSE) + } + # Check that alExp can be found + is_name <- is.character(altexps[[exp]]) && + altexps[[exp]] %in% altExpNames(mae[[exp]]) + is_index <- is.numeric(altexps[[exp]]) && + all(altexps[[exp]]%%1==0) && + altexps[[exp]]>0 && + altexps[[exp]]<=length(alExps(mae[[exp]]) ) + if( !( is_name || is_index ) ){ + stop("'", altexps[[exp]], "' does not specify altExp from ", + "experiment '", exp, "'.", call. = FALSE) + } + mae[[exp]] <- altExp(mae[[exp]], altexps[[exp]]) + } + } + } + return(mae) +} + +# Select assays to be included in MOFA +#' @importFrom MultiAssayExperiment experiments +#' @importFrom SummarizedExperiment assay assays assayNames +.select_assays_from_mae <- function(mae, assay.types) { + # Check that value is correct + is_name <- is.character(assay.types) && length(assay.types) > 0 && + length(assay.types) <= length(experiments(mae)) + if( !is_name ){ + stop("'assay.type' must specify name of assays. The lenght must equal ", + "to 'experiments'.", call. 
= FALSE) + } + # Give corresponding experiment names to assay.types + names(assay.types) <- names(mae) + # For every experiment in MAE + for ( exp in names(mae) ){ + # Check that assay exists + if( !assay.types[[exp]] %in% assayNames(mae[[exp]]) ){ + stop("Cannot find assay '", assay.types[[exp]], "' from ", + "experiment '", exp, "'.", call. = FALSE) + } + # Keep only selected assay.type from a given experiment + assays(mae[[exp]]) <- assays(mae[[exp]])[ assay.types[[exp]] ] + } + return(mae) +} + +#' @importFrom SingleCellExperiment altExps +.select_experiments_from_tse <- function(tse, experiments) { + tse_list <- altExps(tse) + main_name <- "main" + + all_names <- make.unique(c(names(tse_list), main_name)) + main_name <- all_names[[length(all_names)]] + tse_list[[main_name]] <- tse + + if( !is.character(experiments) ){ + main_name <- length(tse_list) + } + experiments[ is.na(experiments) ] <- main_name + + # Check that the value is correct + is_name <- is.character(experiments) && length(experiments) > 0 && + length(experiments) <= length(tse_list) && + all(experiments %in% names(tse_list)) + is_index <- is.numeric(experiments) && all(experiments%%1==0) && + length(experiments) > 0 && + length(experiments) <= length(tse_list) && + all(experiments>0 & experiments<=length(tse_list)) + if( !(is_name || is_index) ){ + stop("'experiments' must specify names or index of alternative ", + "experiments of 'x'", call. = FALSE) + } + + tse_list <- tse_list[ experiments ] + return(tse_list) +} + +.select_assays_from_tse_list <- function(tse_list, assay.types) { + # Check that value is correct + is_name <- is.character(assay.types) && length(assay.types) > 0 && + length(assay.types) <= length(tse_list) + if( !is_name ){ + stop("'assay.type' must specify name of assays. The lenght must equal ", + "to 'experiments'.", call. 
= FALSE) + } + # Give corresponding experiment names to assay.types + names(assay.types) <- names(tse_list) + # For every experiment in MAE + for ( exp in names(tse_list) ){ + # Check that assay exists + if( !assay.types[[exp]] %in% assayNames(tse_list[[exp]]) ){ + stop("Cannot find assay '", assay.types[[exp]], "' from ", + "experiment '", exp, "'.", call. = FALSE) + } + # Keep only selected assay.type from a given experiment + assays(tse_list[[exp]]) <- assays(tse_list[[exp]])[ assay.types[[exp]] ] + } + return(tse_list) +} + +################################################################################ diff --git a/data/ibdmdb.rda b/data/ibdmdb.rda new file mode 100644 index 000000000..053a10773 Binary files /dev/null and b/data/ibdmdb.rda differ diff --git a/data/ibdmdb_2omic_demo.rda b/data/ibdmdb_2omic_demo.rda deleted file mode 100644 index 84770ee71..000000000 Binary files a/data/ibdmdb_2omic_demo.rda and /dev/null differ diff --git a/inst/scripts/prepare_ibdmdb_demo.R b/inst/scripts/prepare_ibdmdb.R similarity index 94% rename from inst/scripts/prepare_ibdmdb_demo.R rename to inst/scripts/prepare_ibdmdb.R index 92fd922b6..107830a70 100644 --- a/inst/scripts/prepare_ibdmdb_demo.R +++ b/inst/scripts/prepare_ibdmdb.R @@ -1,7 +1,7 @@ # Build compact demo objects (.rda) for IBDMDB examples/vignettes. # # Produces: -# data/ibdmdb_2omic_demo.rda (ibdmdb_2omic_demo) +# data/ibdmdb.rda (ibdmdb) # # Source raw inputs from inst/extdata and pre-process for speed/size. # @@ -15,7 +15,7 @@ message("== IBDMDB demo data preparation ==") # This script is for DEVELOPERS only. # It downloads large raw files into a local cache (NOT committed to git), -# then creates a compact demo dataset in data/ibdmdb_2omic_demo.rda. +# then creates a compact demo dataset in data/ibdmdb.rda. # It is NOT run during R CMD check. 
cache_dir <- file.path("tools", "cache", "mia_ibdmdb_cache") @@ -79,13 +79,13 @@ read_ibdmdb_tsv <- function(path) { if (length(comment_idx) == 0L) { stop("No commented header line found in: ", path) } - + header_line <- first[max(comment_idx)] header_line <- sub("^#\\s*", "", header_line) header_line <- sub("^\ufeff", "", header_line) header_vec <- strsplit(header_line, "\t", fixed = TRUE)[[1]] header_vec <- gsub('^"|"$', "", header_vec) - + dt <- data.table::fread( path, skip = length(comment_idx), @@ -152,34 +152,34 @@ make_SE <- function(mat, meta_df = NULL, assay_name = "counts") { colData = S4Vectors::DataFrame(row.names = colnames(mat)) )) } - + md <- as.data.frame(meta_df, stringsAsFactors = FALSE, check.names = FALSE) - + overlaps <- vapply( md, function(col) sum(as.character(col) %in% colnames(mat)), numeric(1) ) best <- names(overlaps)[which.max(overlaps)] - + if (length(best) == 0 || overlaps[[best]] == 0) { return(SummarizedExperiment::SummarizedExperiment( assays = setNames(list(mat), assay_name), colData = S4Vectors::DataFrame(row.names = colnames(mat)) )) } - + md_sub <- md[md[[best]] %in% colnames(mat), , drop = FALSE] md_sub <- md_sub[!duplicated(md_sub[[best]]), , drop = FALSE] - + rownames(md_sub) <- as.character(md_sub[[best]]) md_sub <- md_sub[colnames(mat), , drop = FALSE] - + if (anyDuplicated(rownames(md_sub))) { rownames(md_sub) <- make.unique(rownames(md_sub), sep = "_dup") } stopifnot(identical(rownames(md_sub), colnames(mat))) - + SummarizedExperiment::SummarizedExperiment( assays = setNames(list(mat), assay_name), colData = S4Vectors::DataFrame(md_sub) @@ -256,22 +256,21 @@ meta_df <- meta_full se_mgx <- make_SE(M_mgx, meta_df, assay_name = "mgx") se_mtx <- make_SE(M_mtx, meta_df, assay_name = "mtx") -mae2 <- MultiAssayExperiment::MultiAssayExperiment( +mae <- MultiAssayExperiment::MultiAssayExperiment( experiments = list(MGX = se_mgx, MTX = se_mtx) ) if (!is.null(SummarizedExperiment::colData(se_mgx))) { - 
MultiAssayExperiment::colData(mae2) <- SummarizedExperiment::colData(se_mgx) + MultiAssayExperiment::colData(mae) <- SummarizedExperiment::colData(se_mgx) } # Save only one dataset object -ibdmdb_2omic_demo <- mae2 - +ibdmdb <- mae save( - ibdmdb_2omic_demo, - file = file.path("data", "ibdmdb_2omic_demo.rda"), + ibdmdb, + file = file.path("data", "ibdmdb.rda"), compress = "xz" ) -message("Saved: data/ibdmdb_2omic_demo.rda") -message("== Done. Re-run devtools::document(); devtools::check(); BiocCheck::BiocCheck(). ==") \ No newline at end of file +message("Saved: data/ibdmdb.rda") +message("== Done. Re-run devtools::document(); devtools::check(); BiocCheck::BiocCheck(). ==") diff --git a/man/getJointRPCA.Rd b/man/getJointRPCA.Rd deleted file mode 100644 index ca1b1cd79..000000000 --- a/man/getJointRPCA.Rd +++ /dev/null @@ -1,68 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/getJointRPCA.R -\name{getJointRPCA} -\alias{getJointRPCA} -\title{Run Joint-RPCA and store embedding in reducedDim} -\arguments{ -\item{altexp}{Optional name of an alternative experiment. If supplied, -Joint-RPCA is run on \code{altExp(x, altexp)} instead of \code{x}.} - -\item{name}{Character scalar giving the name of the \code{reducedDim} slot -in which to store the joint sample embedding. Defaults to \code{"JointRPCA"}.} - -\item{x}{Input object: \code{MultiAssayExperiment}, \code{SummarizedExperiment} -(including \code{TreeSummarizedExperiment}), list of matrices, or single matrix.} - -\item{experiments}{Character vector of experiment names to extract when \code{x} -is a \code{MultiAssayExperiment} (i.e. \code{names(experiments(x))}). -If \code{NULL}, all experiments are used. - -For \code{MultiAssayExperiment} inputs, \strong{one assay per experiment} is -used: by default the first assay returned by -\code{assayNames()} (or index \code{1L} if unnamed). -The actually used assay names are recorded in \code{$assay_names_used} in -the result. 
If you need a different assay (e.g. \code{"relab"} instead of -\code{"counts"}), subset or reorder assays in \code{x} before calling -\code{jointRPCAuniversal()}.} - -\item{transform}{Character string specifying preprocessing applied to each -input table before ordination. Use \code{"rclr"} to apply the robust CLR -transform (via \code{decostand(method = "rclr")}) or \code{"none"} to -disable transformation (data are used as-is after masking non-finite values).} - -\item{optspace.tol}{Numeric tolerance passed to \code{optspace()}.} - -\item{center}{Logical; whether to center the reconstructed low-rank matrix -(double-centering) prior to SVD/PCA steps.} - -\item{scale}{Logical; whether to scale the reconstructed matrix prior to -SVD/PCA steps. Defaults to \code{FALSE}.} - -\item{...}{Additional arguments passed to \code{.joint_rpca()}.} -} -\value{ -The input object \code{x} with a new entry in -\code{reducedDim(x, name)} containing the Joint-RPCA sample embedding. -The full Joint-RPCA result (including distances, cross-validation -statistics and transformed tables) is stored in -\code{metadata(x)$JointRPCA[[name]]}. - -Output from \code{.joint_rpca()} with extra fields when \code{x} is a -\code{MultiAssayExperiment}: -\itemize{ -\item \code{$experiment_names}: character vector of experiments used. -\item \code{$assay_names_used}: named character vector giving, for each -experiment, the assay name that was used (typically the first in -\code{assayNames()}). -} -} -\description{ -Run Joint-RPCA and store embedding in reducedDim - -Universal Joint RPCA Wrapper -} -\details{ -Convenience wrapper that runs Joint Robust PCA on one or more compositional -tables and stores the resulting sample embedding in \code{reducedDim(x, name)}, -similar to \code{runMDS()} and \code{runPCA()}. 
-} diff --git a/man/getRPCA.Rd b/man/getRPCA.Rd new file mode 100644 index 000000000..17263e4d8 --- /dev/null +++ b/man/getRPCA.Rd @@ -0,0 +1,151 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AllGenerics.R, R/getRPCA.R +\name{getRPCA} +\alias{getRPCA} +\alias{addRPCA} +\alias{getJointRPCA} +\alias{addJointRPCA} +\alias{getRPCA,SingleCellExperiment-method} +\alias{getRPCA,SummarizedExperiment-method} +\alias{addRPCA,SummarizedExperiment-method} +\alias{getJointRPCA,MultiAssayExperiment-method} +\alias{getJointRPCA,SingleCellExperiment-method} +\alias{addJointRPCA,SingleCellExperiment-method} +\alias{addJointRPCA,MultiAssayExperiment-method} +\title{Run (joint) robust principal component analysis (RPCA)} +\usage{ +getRPCA(x, ...) + +addRPCA(x, ...) + +getJointRPCA(x, ...) + +addJointRPCA(x, ...) + +\S4method{getRPCA}{SingleCellExperiment}(x, ...) + +\S4method{getRPCA}{SummarizedExperiment}(x, assay.type = "counts", ...) + +\S4method{addRPCA}{SummarizedExperiment}(x, name = "RPCA", ...) + +\S4method{getJointRPCA}{MultiAssayExperiment}(x, experiments, assay.types, ...) + +\S4method{getJointRPCA}{SingleCellExperiment}(x, experiments, assay.types, ...) + +\S4method{addJointRPCA}{SingleCellExperiment}(x, name = "JointRPCA", ...) + +\S4method{addJointRPCA}{MultiAssayExperiment}(x, name = "JointRPCA", ...) +} +\arguments{ +\item{x}{a \code{\link[SummarizedExperiment]{SummarizedExperiment}} object.} + +\item{...}{additional arguments: +\itemize{ +\item \code{ncomponents}:\code{Integer scalar}. The number of components +to estimate. (Default: \code{3}) +\item \code{max.iterations}:\code{Integer scalar}. The number of iterations +run in OptSpace algorithm. (Default: \code{3}) +\item \code{tolerance}:\code{Numeric scalar}. Accepted error between +lower rank representation obtained by OptSpace algorithm and original +table. (Default: \code{3}) +}} + +\item{assay.type}{\code{Character scalar}. Specifies the name of assay +used in calculation. 
(Default: \code{"counts"})} + +\item{name}{\code{Character scalar}. Name of results stored in \code{x}. +(Default: \code{"RPCA"} or \code{"JointRPCA"} depending on the method)} + +\item{experiments}{\code{Character vector} or \code{integer scalar}. Names +or indices of experiments selected from \code{x}.} + +\item{assay.types}{\code{Character vector}. Names assays selected from +\code{experiments}.} +} +\value{ +\code{matrix}, \code{TreeSummarizedExperiment} or \code{MultiAssayExperiment} +object. +} +\description{ +These functions implement robust principal component analysis (RPCA) +for single tables and joint RPCA for multiple tables. +\code{*RPCA} functions run RPCA for single table. \code{*JointRPCA} runs the +analysis jointly for multiple tables. \code{get*} functions return the +results of the analysis while \code{add*} adds them to the input object. +} +\details{ +These functions perform robust principal component analysis (RPCA) using a +low-rank matrix approximation followed by principal component analysis. +Missing values are handled using matrix completion based on the OptSpace +algorithm. + +\strong{Single-table RPCA} + +For a single assay, the workflow is: +\enumerate{ +\item Extract the selected assay matrix +\item Estimate a low-rank approximation of the matrix using the OptSpace +algorithm, which reconstructs missing values and reduces noise. +\item Apply double centering (row and column centering). +\item Apply PCA (via singular value decomposition) to the centered matrix. +\item Return the sample coordinates in PCA space together with additional +information such as loadings, explained variance, and pairwise distances. +} + +\strong{Joint RPCA} + +When multiple assays are provided, joint RPCA estimates a shared low-rank +representation across all tables. Each table may contain different features, +but they must share the same samples. + +The workflow is: +\enumerate{ +\item Extract the selected experiments and assays. 
+\item Estimate a joint low-rank representation using a joint OptSpace +algorithm that learns shared sample factors while allowing +table-specific feature loadings. +\item Apply double centering (row and column centering). +\item Perform PCA on the centered matrix. to obtain the final +sample coordinates. +\item Return the sample coordinates in PCA space together with additional +information such as loadings, explained variance, and pairwise distances. +} + +To assess generalization, the joint RPCA procedure optionally splits the +samples into training and test sets. The low-rank model is learned using the +training data and the test samples are projected into the resulting PCA +space. Reconstruction error for the test set is returned as a measure of +model fit. + +The returned object contains the PCA sample scores as the main result, +with additional information (e.g., rotation matrix, explained variance, +reconstructed matrix, and reconstruction error) stored in attributes. +} +\examples{ +data("ibdmdb") +mae <- ibdmdb + +# Apply data transformations +mae[[1]] <- transformAssay(mae[[1]], assay.type = "mgx", method = "rclr") +mae[[2]] <- transformAssay(mae[[2]], assay.type = "mtx", method = "rclr") + +# Run joint-RPCA +res <- getJointRPCA( + mae, + experiments = c(1, 2), + assay.types = c("rclr", "rclr") +) + +# Run RPCA for single experiment +res <- getRPCA(mae[[1]], assay.type = "rclr") + +} +\references{ +Martino, C. and Shenhav, L. et al. (2020) +Context-aware dimensionality reduction deconvolutes gut microbial community +dynamics. +\emph{Nat. 
Biotechnol.} doi:10.1038/s41587-020-0660-7 +} +\seealso{ +\code{\link[scater:runPCA]{scater::runPCA}} +} diff --git a/man/ibdmdb_2omic_demo.Rd b/man/ibdmdb.Rd similarity index 89% rename from man/ibdmdb_2omic_demo.Rd rename to man/ibdmdb.Rd index 6d490683b..d5a19a1e8 100644 --- a/man/ibdmdb_2omic_demo.Rd +++ b/man/ibdmdb.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/mia.R \docType{data} -\name{ibdmdb_2omic_demo} -\alias{ibdmdb_2omic_demo} +\name{ibdmdb} +\alias{ibdmdb} \title{IBDMDB 2-omic demo dataset (MGX + MTX)} \format{ A \code{MultiAssayExperiment} with two experiments: @@ -20,14 +20,14 @@ then subsets and filters the raw data to create a small example suitable for vignettes. } \usage{ -data("ibdmdb_2omic_demo") +data("ibdmdb") } \description{ A compact example derived from the Integrative Human Microbiome Project (iHMP) Inflammatory Bowel Disease (IBD) cohort. } \details{ -The dataset \code{ibdmdb_2omic_demo} is a named list with: +The dataset \code{ibdmdb} is a named list with: \itemize{ \item \code{se_mgx}: metagenomic taxonomic profiles (MGX) \item \code{se_mtx}: metatranscriptomic taxonomic profiles (MTX) @@ -37,7 +37,7 @@ containing both experiments These compact objects are intended for quick examples and vignettes. Load with: -\code{data("ibdmdb_2omic_demo")}. +\code{data("ibdmdb")}. } \references{ Lloyd-Price J, Arze C, Ananthakrishnan AN, et al. 
diff --git a/man/mia-datasets.Rd b/man/mia-datasets.Rd index 48dd28a1a..5ed336d7a 100644 --- a/man/mia-datasets.Rd +++ b/man/mia-datasets.Rd @@ -21,7 +21,7 @@ features and 280 samples} and 3 samples} \item{\code{\link{GlobalPatterns}}: A TreeSummarizedExperiment with 19216 features and 26 samples} -\item{\code{\link{ibdmdb_2omic_demo}}: A compact +\item{\code{\link{ibdmdb}}: A compact \code{MultiAssayExperiment} demo dataset with 2 experiments (MGX + MTX)} \item{\code{\link{HintikkaXOData}}: A MultiAssayExperiment with 3 experiments (microbiota, metabolites and biomarkers)} diff --git a/pkgdown/_pkgdown.yml b/pkgdown/_pkgdown.yml index f1f2887a6..d4f4fe208 100644 --- a/pkgdown/_pkgdown.yml +++ b/pkgdown/_pkgdown.yml @@ -43,7 +43,7 @@ reference: - peerj13075 - Tengeler2020 - Tito2024QMP - - ibdmdb_2omic_demo + - ibdmdb - title: Dataset description - contents: - getPrevalence diff --git a/tests/testthat/test-getJointRPCA.R b/tests/testthat/test-getJointRPCA.R deleted file mode 100644 index 07eea3a63..000000000 --- a/tests/testthat/test-getJointRPCA.R +++ /dev/null @@ -1,306 +0,0 @@ -test_that("getJointRPCA stores embedding in reducedDim and metadata", { - skip_if_not(requireNamespace("SingleCellExperiment", quietly = TRUE)) - - set.seed(123) - X <- abs(matrix(rnorm(20 * 10), nrow = 20, ncol = 10)) - rownames(X) <- paste0("f", 1:20) - colnames(X) <- paste0("s", 1:10) - - sce <- SingleCellExperiment::SingleCellExperiment( - assays = list(counts = X) - ) - - sce2 <- mia:::getJointRPCA( - sce, - name = "JointRPCA_test", - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 3 - ) - - emb <- reducedDim(sce2, "JointRPCA_test") - expect_true(is.matrix(emb)) - expect_equal(nrow(emb), ncol(X)) - expect_equal(rownames(emb), colnames(X)) - - jr <- metadata(sce2)$JointRPCA[["JointRPCA_test"]] - expect_type(jr, "list") - expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(jr))) -}) - -test_that("single-view .joint_rpca returns 
the expected structure", { - set.seed(123) - # Features x samples - X <- abs(matrix(rnorm(20 * 12), nrow = 20, ncol = 12)) - rownames(X) <- paste0("f", 1:20) - colnames(X) <- paste0("s", 1:12) - - fit <- mia:::.joint_rpca( - tables = list(assay1 = X), - n.components = 3, - max.iterations = 2, - transform = "none", - n.test.samples = 4 - ) - - # Top-level structure - expect_type(fit, "list") - expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(fit))) - - # Ordination structure - OR <- fit$ord_res - expect_true(all(c("eigvals", "samples", "features", "proportion.explained") %in% names(OR))) - - # Samples: should include both train + projected test = all original samples - expect_equal(nrow(OR$samples), ncol(X)) - expect_equal(colnames(OR$samples), paste0("PC", 1:3)) - expect_true(all(colnames(OR$samples) %in% names(OR$proportion.explained))) - - # Features: now per-view list, even for single-view - expect_true(is.list(OR$features)) - expect_true("assay1" %in% names(OR$features)) - - F1 <- OR$features$assay1 - if (is.data.frame(F1)) F1 <- as.matrix(F1) - expect_equal(nrow(F1), nrow(X)) - expect_equal(colnames(F1), paste0("PC", 1:3)) - - # CV stats & dist - expect_true(is.data.frame(fit$cv_stats)) - expect_true(all(c("mean_CV", "std_CV", "run", "iteration") %in% names(fit$cv_stats))) - - expect_s3_class(fit$dist, "DistanceMatrix") - expect_true(is.matrix(fit$dist$data)) - expect_equal(nrow(fit$dist$data), ncol(X)) - expect_equal(colnames(fit$dist$data), colnames(X)) -}) - -test_that("multi-view .joint_rpca preserves per-view feature loadings", { - set.seed(42) - # Two views, same samples - S <- paste0("s", 1:10) - A <- abs(matrix(rnorm(25 * 10), 25, 10, - dimnames = list(paste0("a", 1:25), S))) - B <- abs(matrix(rnorm(30 * 10), 30, 10, - dimnames = list(paste0("b", 1:30), S))) - - fit <- mia:::.joint_rpca( - tables = list(MGX = A, MTX = B), - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 3 - ) - - OR <- 
fit$ord_res - expect_true(is.list(OR$features)) - expect_true(all(c("MGX", "MTX") %in% names(OR$features))) - - # Check each view's loading matrix dimensions & rownames - MGXv <- OR$features$MGX; MTXv <- OR$features$MTX - if (is.data.frame(MGXv)) MGXv <- as.matrix(MGXv) - if (is.data.frame(MTXv)) MTXv <- as.matrix(MTXv) - - expect_equal(nrow(MGXv), nrow(A)) - expect_equal(nrow(MTXv), nrow(B)) - expect_equal(colnames(MGXv), paste0("PC", 1:2)) - expect_equal(colnames(MTXv), paste0("PC", 1:2)) - expect_true(all(rownames(MGXv) %in% rownames(A))) - expect_true(all(rownames(MTXv) %in% rownames(B))) - - # Samples include all training + projected test - expect_equal(nrow(OR$samples), length(S)) - expect_equal(colnames(OR$samples), paste0("PC", 1:2)) -}) - -test_that("unshared samples are dropped with a warning and alignment is correct", { - set.seed(7) - S1 <- paste0("s", 1:8) - S2 <- c(paste0("s", 3:10)) - A <- abs(matrix(rnorm(20 * length(S1)), 20, length(S1), - dimnames = list(paste0("a", 1:20), S1))) - B <- abs(matrix(rnorm(15 * length(S2)), 15, length(S2), - dimnames = list(paste0("b", 1:15), S2))) - - expect_warning( - fit <- mia:::.joint_rpca( - tables = list(MGX = A, MTX = B), - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 2 - ), - regexp = "Removing.*sample\\(s\\).*overlap", - all = FALSE - ) - - OR <- fit$ord_res - expect_equal(nrow(OR$samples), 6) -}) - -test_that("projection of new samples via .transform appends rows and keeps component names", { - set.seed(99) - S_train <- paste0("s", 1:8) - featsA <- paste0("a", 1:18) - featsB <- paste0("b", 1:22) - - A <- abs(matrix(rnorm(length(featsA) * length(S_train)), length(featsA), length(S_train), - dimnames = list(featsA, S_train))) - B <- abs(matrix(rnorm(length(featsB) * length(S_train)), length(featsB), length(S_train), - dimnames = list(featsB, S_train))) - - fit <- mia:::.joint_rpca( - tables = list(MGX = A, MTX = B), - n.components = 3, - max.iterations = 2, - transform = 
"none", - n.test.samples = 3 - ) - - OR <- fit$ord_res - n_before <- nrow(OR$samples) - - # Create new samples (same features, new sample IDs) - S_new <- c("s9", "s10") - A_new <- abs(matrix(rnorm(length(featsA) * length(S_new)), length(featsA), length(S_new), - dimnames = list(featsA, S_new))) - B_new <- abs(matrix(rnorm(length(featsB) * length(S_new)), length(featsB), length(S_new), - dimnames = list(featsB, S_new))) - - # Project new samples - OR2 <- mia:::.transform( - ordination = OR, - tables = list(MGX = A_new, MTX = B_new), - apply.rclr = FALSE - ) - - expect_true(is.list(OR2)) - expect_true(all(c("samples", "features", "eigvals", "proportion.explained") %in% names(OR2))) - expect_equal(colnames(OR2$samples), colnames(OR$samples)) - expect_true(all(S_new %in% rownames(OR2$samples))) - expect_equal(nrow(OR2$samples), n_before + length(S_new)) -}) - -test_that("errors surface for duplicated sample IDs during preprocessing", { - set.seed(101) - X <- abs(matrix(rpois(24, lambda = 5), 6, 4)) - rownames(X) <- paste0("f", 1:6) - colnames(X) <- c("s1", "s1", "s2", "s3") - - expect_error( - mia:::.joint_rpca( - tables = list(assay1 = X), - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 2 - ), - regexp = "duplicate sample \\(column\\) IDs", - ignore.case = TRUE - ) -}) - -test_that("jointRPCAuniversal works on MultiAssayExperiment", { - set.seed(2025) - - # Two views, same samples - S <- paste0("s", 1:8) - MGX_mat <- abs(matrix( - rnorm(15 * length(S)), - nrow = 15, ncol = length(S), - dimnames = list(paste0("mgx", 1:15), S) - )) - MTX_mat <- abs(matrix( - rnorm(10 * length(S)), - nrow = 10, ncol = length(S), - dimnames = list(paste0("mtx", 1:10), S) - )) - - se_mgx <- SummarizedExperiment::SummarizedExperiment( - assays = list(counts = MGX_mat) - ) - se_mtx <- SummarizedExperiment::SummarizedExperiment( - assays = list(counts = MTX_mat) - ) - - mae2 <- MultiAssayExperiment::MultiAssayExperiment( - experiments = list(MGX = se_mgx, 
MTX = se_mtx) - ) - - fit <- mia:::jointRPCAuniversal( - x = mae2, - experiments = c("MGX", "MTX"), - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 3 - ) - - # Basic structure - expect_type(fit, "list") - expect_true(all(c("ord_res", "dist", "cv_stats", "rclr_tables") %in% names(fit))) - - # rclr_tables should be named by experiment - expect_true(is.list(fit$rclr_tables)) - expect_equal(names(fit$rclr_tables), c("MGX", "MTX")) - - # Per-view feature loadings should also carry MGX/MTX names - OR <- fit$ord_res - expect_true(is.list(OR$features)) - expect_true(all(c("MGX", "MTX") %in% names(OR$features))) - - MGXv <- OR$features$MGX - MTXv <- OR$features$MTX - if (is.data.frame(MGXv)) MGXv <- as.matrix(MGXv) - if (is.data.frame(MTXv)) MTXv <- as.matrix(MTXv) - - expect_equal(nrow(MGXv), nrow(MGX_mat)) - expect_equal(nrow(MTXv), nrow(MTX_mat)) - expect_equal(colnames(MGXv), paste0("PC", 1:2)) - expect_equal(colnames(MTXv), paste0("PC", 1:2)) -}) - -test_that("jointRPCAuniversal uses default assays per experiment in a MAE and records them", { - skip_if_not_installed("MultiAssayExperiment") - skip_if_not_installed("SummarizedExperiment") - - S <- paste0("s", 1:6) - A1_counts <- matrix(abs(rnorm(10 * 6)), nrow = 10, - dimnames = list(paste0("a", 1:10), S)) - A1_alt <- A1_counts * 2 - - A2_counts <- matrix(abs(rnorm(8 * 6)), nrow = 8, - dimnames = list(paste0("b", 1:8), S)) - A2_alt <- A2_counts * 3 - - se1 <- SummarizedExperiment::SummarizedExperiment( - assays = list(counts = A1_counts, alt = A1_alt) - ) - se2 <- SummarizedExperiment::SummarizedExperiment( - assays = list(counts = A2_counts, alt = A2_alt) - ) - - mae <- MultiAssayExperiment::MultiAssayExperiment( - experiments = list(MGX = se1, MTX = se2) - ) - - fit <- mia:::jointRPCAuniversal( - x = mae, - experiments = c("MGX", "MTX"), - n.components = 2, - max.iterations = 2, - transform = "none", - n.test.samples = 2 - ) - - # Basic structure - expect_true(is.list(fit)) - 
expect_true(all(c("MGX", "MTX") %in% names(fit$rclr_tables))) - - expect_true(all(c("experiment_names", "assay_names_used") %in% names(fit))) - expect_equal(fit$experiment_names, c("MGX", "MTX")) - expect_named(fit$assay_names_used, c("MGX", "MTX")) - expect_equal(fit$assay_names_used[["MGX"]], "counts") - expect_equal(fit$assay_names_used[["MTX"]], "counts") -}) \ No newline at end of file