diff --git a/R/RMT.R b/R/RMT.R index 36cda369..1f39be47 100644 --- a/R/RMT.R +++ b/R/RMT.R @@ -1,26 +1,26 @@ #' Denoising of Covariance matrix using Random Matrix Theory -#' +#' #' @name estRMT -#' +#' #' @details #' This method takes in data as a matrix object. It then #' fits a marchenko pastur density to eigenvalues of the correlation matrix. All #' eigenvalues above the cutoff are retained and ones below the cutoff are #' replaced such that the trace of the correlation matrix is 1 or non-significant -#' eigenvalues are deleted and diagonal of correlation matrix is changed to 1. +#' eigenvalues are deleted and diagonal of correlation matrix is changed to 1. #' Finally, correlation matrix is converted to covariance matrix. This function #' was taken and modified from the covmat package (https://github.com/cran/covmat) #' which has since been deprecated on CRAN. #' -#' @param R input matrix -#' @param Q ratio of rows/size. Can be supplied externally or fit using data -#' @param cutoff takes two values max/each. If cutoff is max, Q is fitted and +#' @param mat input matrix +#' @param dim_ratio ratio of rows/size. Can be supplied externally or fit using data +#' @param cutoff takes two values max/each. If cutoff is max, Q is fitted and #' cutoff for eigenvalues is calculated. If cutoff is each, Q is set to #' row/size. Individual cutoff for each eigenvalue is calculated and used -#' for filteration. -#' @param eigenTreat takes 2 values, average/delete. If average then the noisy +#' for filteration. +#' @param eigenTreat takes 2 values, average/delete. If average then the noisy #' eigenvalues are averged and each value is replaced by average. If delete -#' then noisy eigenvalues are ignored and the diagonal entries of the +#' then noisy eigenvalues are ignored and the diagonal entries of the #' correlation matrix are replaced with 1 to make the matrix psd. #' @param numEig number of eigenvalues that are known for variance calculation. #' Default is set to 1. If numEig = 0 then variance is assumed to be 1. @@ -28,106 +28,119 @@ #' @importFrom stats optim cov #' #' @return A denoised RMT object -#' -#' @examples +#' +#' @examples #' rand_cor_mat <- cor(matrix(rnorm(100), nrow = 10)) #' denoised_rand_cor_mat <- estRMT(rand_cor_mat)$cov -#' +#' #' @author Rohit Arora -#' +#' #' @export -#' -estRMT <- function(R, Q = NA, cutoff = c("max", "each"), - eigenTreat = c("average", "delete") , numEig = 1) { - - .data <- as.matrix(R) - T <- nrow(.data) - M <- ncol(.data) - if (T < M) stop("Does not work when nrow < ncol") - - if(!is.na(Q)) if(Q < 1) stop("Does not work for Q<1") - cutoff <- cutoff[1]; if(!cutoff %in% c("max", "each")) stop("Invalid cutoff") - if(cutoff == "each") Q <- T/M - - eigenTreat <- eigenTreat[1]; - if(!eigenTreat %in% c("average", "delete")) stop("Invalid eigenTreat option") - - if (numEig < 0) stop("Number of eigenvalues must be non-negative") - - #eigenvalues can be negative. To avoid this e need a positive-definite matrix - S <- cov(.data); S <- as.matrix(Matrix::nearPD(S)$mat) - D <- diag(diag(S)); C <- Matrix::cov2cor(S); - +estRMT <- function(mat, dim_ratio = NA, cutoff = c("max", "each"), eigenTreat = c("average", "delete"), numEig = 1) { + .data <- as.matrix(mat) + n_row <- nrow(.data) + n_col <- ncol(.data) + cutoff <- match.arg(cutoff) + eigenTreat <- match.arg(eigenTreat) + + if (cutoff == "each") { + dim_ratio <- n_row / n_col + } + + stopifnot("Does not work when nrow < ncol" = n_row >= n_col) + if (!is.na(dim_ratio)) { + stopifnot("Does not work for dim_ration < 1" = dim_ratio >= 1) + } + + stopifnot("Number of eigenvalues must be non-negative" = numEig > 0) + + #eigenvalues can be negative. To avoid this e need a positive-definite matrix + S <- cov(.data) + S <- as.matrix(Matrix::nearPD(S)$mat) + D <- diag(diag(S)) + C <- Matrix::cov2cor(S) + # Marchenko Pastur density is defined for eigenvalues of correlation matrix - eigen.C <- eigen(C,symmetric=T) - lambdas <- eigen.C$values; sigma.sq <- mean(lambdas) - - sigma.sq <- 1 - sum(head(lambdas,numEig))/M - - #minimize log-likelihood. + eigen.C <- eigen(C, symmetric = n_row) + lambdas <- eigen.C$values + sigma.sq <- mean(lambdas) + + sigma.sq <- 1 - sum(head(lambdas, numEig)) / n_col + + #minimize log-likelihood. loglik.marpas <- function(theta, sigma.sq) { - Q <- theta - val <- sapply(lambdas, function(x) RMTstat::dmp(x,svr = Q, var=sigma.sq)) + dim_ratio <- theta + val <- sapply(lambdas, function(x) RMTstat::dmp(x, svr = dim_ratio, var = sigma.sq)) val <- val[val > 0] ifelse(is.infinite(-sum(log(val))), .Machine$double.xmax, -sum(log(val))) } - - if( is.na(Q) && cutoff != "each") { + + if (is.na(dim_ratio) && cutoff != "each") { lb <- 1 - ub <- max(T/M,5) + ub <- max(n_row / n_col, 5) starts <- seq(lb, ub, length.out = 50) # this would be a logical place to use BiocParallel::bpapply - fit.marpas <- do.call(rbind, - lapply(starts, - function(start) { - optim(par=start, - fn=loglik.marpas, - method="L-BFGS-B", - lower=lb, - upper=ub, - sigma.sq=sigma.sq) - })) - idx <- grep("CONVERGENCE",unlist(fit.marpas[,"message"])) - vals <- fit.marpas[idx,c("par","value")] # wtf is going on here - Q <- unlist(vals[which.min(vals[,"value"]),"par"]) + fit.marpas <- do.call( + rbind, + lapply( + starts, + function(start) { + optim(par = start, fn = loglik.marpas, method = "L-BFGS-B", lower = lb, upper = ub, sigma.sq = sigma.sq) + } + ) + ) + idx <- grep("CONVERGENCE", unlist(fit.marpas[, "message"])) + vals <- fit.marpas[idx, c("par", "value")] # wtf is going on here + dim_ratio <- unlist(vals[which.min(vals[, "value"]), "par"]) } - - lambda.max <- RMTstat::qmp(1, svr=Q, var = sigma.sq) + + lambda.max <- RMTstat::qmp(1, svr = dim_ratio, var = sigma.sq) # now that we have a fit. lets denoise eigenvalues below the cutoff - - if(cutoff == "max") { + + if (cutoff == "max") { idx <- which(lambdas > lambda.max) - } else if(cutoff == "each") { + } else if (cutoff == "each") { cutoff.each <- sapply(2:length(lambdas), function(i) { - eigr <- lambdas[i:M] - mean(eigr)*(1 + (M - i + 1)/T + 2*sqrt((M - i + 1)/T)) + eigr <- lambdas[i:n_col] + mean(eigr) * (1 + (n_col - i + 1) / n_row + 2 * sqrt((n_col - i + 1) / n_row)) }) - idx <- c(1, 1 + which(lambdas[-1] > cutoff.each)) + idx <- c(1, 1 + which(lambdas[-1] > cutoff.each)) + } + + if (length(idx) == 0) { + return(S) + } + + val <- eigen.C$values[idx] + vec <- eigen.C$vectors[, idx, drop = FALSE] + sum <- 0 + for (i in seq_len(ncol(vec))) { + sum <- sum + val[i] * vec[, i] %*% t(vec[, i]) } - - if (length(idx) == 0) return(S) - - val <- eigen.C$values[idx]; vec <- eigen.C$vectors[,idx,drop=FALSE] - sum <- 0; for (i in 1:ncol(vec)) sum <- sum + val[i]*vec[,i] %*% t(vec[,i]) - - # trace of correlation matrix is 1. Use this to determine all the remaining - # eigenvalues - + + # trace of correlation matrix is 1. Use this to determine all the remaining eigenvalues + lambdas.cleaned <- c() clean.C <- if (eigenTreat == "average") { - lambdas.cleaned <- c(val, rep(1,M)) - sum + sum(eigen.C$values[-idx])/M * diag(rep(1,M)) + lambdas.cleaned <- c(val, rep(1, n_col)) + sum + sum(eigen.C$values[-idx]) / n_col * diag(rep(1, n_col)) } else if (eigenTreat == "delete") { - lambdas.cleaned <- c(val, rep(0,M)) + lambdas.cleaned <- c(val, rep(0, n_col)) diag(sum) <- 1 sum } - + # convert correlation to covariance matrix and return clean.S <- D^0.5 %*% clean.C %*% D^0.5 - fit <- list(cov = clean.S, Q = Q, var = sigma.sq, eigVals = lambdas, - eigVals.cleaned = lambdas.cleaned, lambdascutoff = lambda.max) - + fit <- list( + cov = clean.S, + dim_ratio = dim_ratio, + var = sigma.sq, + eigVals = lambdas, + eigVals.cleaned = lambdas.cleaned, + lambdascutoff = lambda.max + ) + class(fit) <- "RMT" fit } @@ -146,7 +159,7 @@ estRMT <- function(R, Q = NA, cutoff = c("max", "each"), #' @param assay What assay type this is ("rna", "atac") #' #' @return A denoised correlation matrix object for plotting with plotCorMatrix -#' +#' #' @import SummarizedExperiment #' @import GenomicRanges #' @@ -156,10 +169,16 @@ estRMT <- function(R, Q = NA, cutoff = c("max", "each"), #' data("k562_scrna_chr14", package = "compartmap") #' denoised_cor_mat <- getDenoisedCorMatrix(k562_scrna_chr14, genome = "hg19", assay = "rna") -getDenoisedCorMatrix <- function(obj, res = 1e6, chr = "chr14", - genome = c("hg19", "hg38", "mm9", "mm10"), - iter = 2, targets = NULL, prior.means = NULL, - assay = c("rna", "atac", "array")) { +getDenoisedCorMatrix <- function( + obj, + res = 1e6, + chr = "chr14", + genome = c("hg19", "hg38", "mm9", "mm10"), + iter = 2, + targets = NULL, + prior.means = NULL, + assay = c("rna", "atac", "array") +) { ## this is a wrapper to give back a denoised correlation matrix to plot #match the assay args assay <- match.arg(assay) @@ -172,20 +191,27 @@ getDenoisedCorMatrix <- function(obj, res = 1e6, chr = "chr14", #subset to selected chromosome(s) obj <- keepSeqlevels(obj, chr, pruning.mode = "coarse") - + #get the prior means if (is.null(prior.means)) { - prior.means <- getGlobalMeans(obj=obj, targets=targets, - assay=assay) + prior.means <- getGlobalMeans(obj = obj, targets = targets, assay = assay) } else { message("Assuming the prior means passed were derived from the full sample set.") } - + #shrink bins message("Shrinking bins with the JSE.") - bin.mat <- shrinkBins(x = obj, original.x = obj, prior.means = prior.means, - chr = chr, res = res, targets = targets, - jse = TRUE, assay = assay, genome = genome) + bin.mat <- shrinkBins( + x = obj, + original.x = obj, + prior.means = prior.means, + chr = chr, + res = res, + targets = targets, + jse = TRUE, + assay = assay, + genome = genome + ) #get the raw correlation matrix cor.mat <- getCorMatrix(binmat = bin.mat, squeeze = FALSE) #denoise with RMT @@ -198,13 +224,12 @@ getDenoisedCorMatrix <- function(obj, res = 1e6, chr = "chr14", cor.mat.denoise <- compartmap::estRMT(cor.mat.denoise)$cov } } - + #rescale - cor.mat.denoise <- scales::rescale(cor.mat.denoise, - to = c(0,1)) + cor.mat.denoise <- scales::rescale(cor.mat.denoise, to = c(0, 1)) #tidy up colnames(cor.mat.denoise) <- rownames(cor.mat.denoise) <- as.character(granges(cor.mat$gr.cor)) - + return(cor.mat.denoise) } @@ -219,10 +244,10 @@ getDenoisedCorMatrix <- function(obj, res = 1e6, chr = "chr14", #' @param lowertri Whether to keep the lower triangle of the matrix #' #' @return Either a ggplot object or plot -#' +#' #' @importFrom rlang .data #' @importFrom ggplot2 ggplot aes geom_raster scale_fill_gradient2 theme_minimal theme element_blank -#' +#' #' @export #' #' @examples @@ -230,11 +255,13 @@ getDenoisedCorMatrix <- function(obj, res = 1e6, chr = "chr14", #' set.seed(1000) #' my_plot <- plotCorMatrix(dummy, return.plot.obj = TRUE) -plotCorMatrix <- function(denoised.cor.mat, - midpoint = 0.3, - return.plot.obj = FALSE, - uppertri = FALSE, - lowertri = FALSE) { +plotCorMatrix <- function( + denoised.cor.mat, + midpoint = 0.3, + return.plot.obj = FALSE, + uppertri = FALSE, + lowertri = FALSE +) { ## upper tri if (uppertri) { denoised.cor.mat[upper.tri(denoised.cor.mat)] <- 0 @@ -248,16 +275,15 @@ plotCorMatrix <- function(denoised.cor.mat, ## plot p <- ggplot(cor.mat.melt, aes(x = .data$Var2, y = .data$Var1, fill = .data$value)) + geom_raster() + - scale_fill_gradient2(low = "white", mid = "white", high = "red3", midpoint = {{ midpoint }}, - name = "Correlation") + + scale_fill_gradient2(low = "white", mid = "white", high = "red3", midpoint = {{ midpoint }}, name = "Correlation") + theme_minimal() + theme( axis.title = element_blank(), axis.text = element_blank(), panel.grid = element_blank(), - plot.margin= grid::unit(c(0, 0, 0, 0), "in") - ) - + plot.margin = grid::unit(c(0, 0, 0, 0), "in") + ) + if (return.plot.obj) { return(p) } else { diff --git a/man/estRMT.Rd b/man/estRMT.Rd index b843d77b..0a211b0f 100644 --- a/man/estRMT.Rd +++ b/man/estRMT.Rd @@ -5,17 +5,17 @@ \title{Denoising of Covariance matrix using Random Matrix Theory} \usage{ estRMT( - R, - Q = NA, + mat, + dim_ratio = NA, cutoff = c("max", "each"), eigenTreat = c("average", "delete"), numEig = 1 ) } \arguments{ -\item{R}{input matrix} +\item{mat}{input matrix} -\item{Q}{ratio of rows/size. Can be supplied externally or fit using data} +\item{dim_ratio}{ratio of rows/size. Can be supplied externally or fit using data} \item{cutoff}{takes two values max/each. If cutoff is max, Q is fitted and cutoff for eigenvalues is calculated. If cutoff is each, Q is set to @@ -49,7 +49,7 @@ which has since been deprecated on CRAN. \examples{ rand_cor_mat <- cor(matrix(rnorm(100), nrow = 10)) denoised_rand_cor_mat <- estRMT(rand_cor_mat)$cov - + } \author{ Rohit Arora