diff --git a/R/epidish.R b/R/epidish.R index cf76b2e..9857c14 100755 --- a/R/epidish.R +++ b/R/epidish.R @@ -2,12 +2,8 @@ #' #' @param beta_matrix a beta matrix with CpGs in rows and samples in columns #' @param mode Choice of a reference-based method ('RPC','CBS','CP') -#' @param reference A matrix of reference 'centroids', i.e. representative molecular profiles, -#' for a number of cell subtypes. rows label molecular features (e.g. CpGs,...) -#' and columns label the cell-type. IDs need to be provided as rownames and -#' colnames, respectively. Missing value is not allowed, and all values in -#' this matrix should be positive or zero. For DNAm data, values should be -#' beta-values. +#' @param reference Either a string (one of 'blood', 'breast', 'epithelial') or a data.frame of reference CpGs, i.e. representative sites, +#' for a number of cell subtypes. If a matrix is provided, it will be used directly as the signature matrix. #' @param maxit Only used in RPC mode, the limit of the number of IWLS iterations #' @param nu.v Only used in CBS mode. It is a vector of several candidate nu values. nu is #' parameter needed for nu-classification, nu-regression, and @@ -51,17 +47,35 @@ run_epidish <- function(beta_matrix, mode <- mode[1] message(paste0(mode, " was chosen as default for \"mode\"")) } - if (length(reference) > 1) { + # If reference is a character, use built-in, else use as matrix + if (is.character(reference) && length(reference) > 1) { reference <- reference[1] message(paste0(reference, " was chosen as default for \"reference\"")) } message(paste0("Starting EpiDISH deconvolution with mode ", mode, " ...")) - ref_mat <- switch(reference, - 'blood' = EpiDISH::centDHSbloodDMC.m, - 'breast' = EpiDISH::centEpiFibFatIC.m, - 'epithelial' = EpiDISH::centEpiFibIC.m - ) + if (is.character(reference)) { + ref_mat <- switch(reference, + 'blood' = EpiDISH::centDHSbloodDMC.m, + 'breast' = EpiDISH::centEpiFibFatIC.m, + 'epithelial' = EpiDISH::centEpiFibIC.m, + stop("Unknown reference type: ", reference) + ) + } else if (is.data.frame(reference)) { + + # check if 'CpGs' column exists + if(!'CpGs' %in% colnames(reference)){ + stop("No CpG column in custom reference.") + } + + reference_cpgs <- reference[['CpGs']] + ref_mat <- reference |> dplyr::select(-CpGs) |> as.matrix() + rownames(ref_mat) <- reference_cpgs + + } else { + stop("reference must be either a character string or a data.frame") + } + if (!is.null(cpg_subset)) { overlap <- intersect(cpg_subset, rownames(ref_mat)) missing <- setdiff(cpg_subset, rownames(ref_mat)) diff --git a/R/methatlas.R b/R/methatlas.R index b47b329..e42dfae 100644 --- a/R/methatlas.R +++ b/R/methatlas.R @@ -1,8 +1,9 @@ #' run MethAtlas #' #' @param beta_matrix a beta matrix with CpGs in rows and samples in columns -#' @param reference_atlas Path to a csv file that saves a reference matrix with CpGs as rows and cell types as columns. -#' The default (tissue-wide) reference file is stored in 'inst/reference_atlas.csv'. +#' @param reference Either a path to a csv file that saves a reference matrix with CpGs as rows and cell types as columns, +#' or a data.frame of reference CpGs with a 'CpGs' column and cell types as other columns. +#' The default (tissue-wide) reference file is stored in 'inst/reference_atlas.csv'. #' @param temp_dir Path to directory where the beta matrix will be saved as a csv file. #' @param out_dir Path to output directory. Output will be a csv file and a png representing the cell type fractions. #' @param use_epic_reference The MethAtlas has a whole-tissue reference or a immunecell-specific reference that is optimized for EPIC arrays (which is a subset of the whole-tissue reference) @@ -10,7 +11,7 @@ #' #' @export #' -run_methatlas <- function(beta_matrix, reference_atlas = system.file("reference_atlas.csv", package = "deconvMe"), temp_dir = NULL, out_dir = NULL, use_epic_reference=FALSE, cpg_subset = NULL){ +run_methatlas <- function(beta_matrix, reference = system.file("reference_atlas.csv", package = "deconvMe"), temp_dir = NULL, out_dir = NULL, use_epic_reference=FALSE, cpg_subset = NULL){ # check if python is installed, else install init_python() @@ -34,12 +35,28 @@ run_methatlas <- function(beta_matrix, reference_atlas = system.file("reference_ beta_path = paste0(tmp_dir, "/beta.csv") write.csv(beta_matrix, beta_path) - # subset reference if applicable - if(use_epic_reference){ - reference_atlas <- system.file("reference_atlas_epic.csv", package = "deconvMe") + # Handle reference parameter + if (is.character(reference)) { + # If it's a character, treat as file path and check if it exists + if(!file.exists(reference)) { + stop("Reference file does not exist: ", reference) + } + + # If it's epic reference, use the epic reference file + if(use_epic_reference){ + reference <- system.file("reference_atlas_epic.csv", package = "deconvMe") + } + ref_df <- read.csv(reference, check.names = FALSE) + } else if (is.data.frame(reference)) { + # If it's a data.frame, use it directly + if (!'CpGs' %in% colnames(reference)) { + stop("No 'CpGs' column in custom reference data.frame.") + } + ref_df <- reference + } else { + stop("reference must be either a character string (file path) or a data.frame") } - ref_df <- read.csv(reference_atlas, check.names = FALSE) if (!is.null(cpg_subset)) { overlap <- intersect(cpg_subset, ref_df$CpGs) missing <- setdiff(cpg_subset, ref_df$CpGs) @@ -54,12 +71,16 @@ run_methatlas <- function(beta_matrix, reference_atlas = system.file("reference_ } ref_df <- ref_df[ref_df$CpGs %in% overlap, , drop = FALSE] # Write the subsetted reference to a temp file - reference_atlas <- paste0(tmp_dir, "/reference_atlas_subset.csv") - write.csv(ref_df, reference_atlas, row.names = FALSE) + reference_path <- paste0(tmp_dir, "/reference_atlas_subset.csv") + write.csv(ref_df, reference_path, row.names = FALSE) + }else{ + # Write the reference to a temp file + reference_path <- paste0(tmp_dir, "/reference_atlas_temp.csv") + write.csv(ref_df, reference_path, row.names = FALSE) } # run meth_atlas - system(paste("python", system.file("deconvolve.py", package = "deconvMe")," -a", reference_atlas, beta_path, "--out", out_dir)) + system(paste("python", system.file("deconvolve.py", package = "deconvMe")," -a", reference_path, beta_path, "--out", out_dir)) # read the results to provide as data frame as.matrix(t(read.csv(paste0(out_dir, "/beta_deconv_output.csv"), @@ -75,6 +96,6 @@ run_methatlas <- function(beta_matrix, reference_atlas = system.file("reference_ #' @export get_methatlas_signature_matrix <- function() { reference_atlas = system.file("reference_atlas.csv", package = "deconvMe") - read.csv(reference_atlas, check.names = FALSE) + unique(read.csv(reference_atlas, check.names = FALSE)) } \ No newline at end of file diff --git a/R/methylResolver.R b/R/methylResolver.R index 80c45b8..394622f 100644 --- a/R/methylResolver.R +++ b/R/methylResolver.R @@ -8,11 +8,12 @@ #' @param purityModel Random Forest model to predict mixture purity (unknown content) which allows the calculation of absolute cell type fractions. Required if absolute is TRUE. Default is our RF model trained on the consensus purity estimate (CPE) using TCGA data. #' @param seed fixed seed to account for RNG influences #' @param cpg_subset Optional character vector of CpGs to subset the signature matrix. Default: NULL (use all CpGs in the signature). +#' @param reference Either the built-in MethylResolver signature matrix or a data.frame of reference CpGs with a 'CpGs' column and cell types as other columns. #' #' @export #' run_methylresolver <- function(beta_matrix, doPar = F, numCores = 1, alpha = seq(0.5,0.9,by = 0.05), - absolute = TRUE, purityModel = MethylResolver::RFmodel, seed = 1, cpg_subset = NULL){ + absolute = TRUE, purityModel = MethylResolver::RFmodel, seed = 1, cpg_subset = NULL, reference = NULL){ set.seed(seed) @@ -23,7 +24,23 @@ run_methylresolver <- function(beta_matrix, doPar = F, numCores = 1, alpha = seq immediate. = TRUE) } - sig <- MethylResolver::MethylSig + # Handle reference parameter + if (is.null(reference)) { + sig <- MethylResolver::MethylSig + } else if (is.data.frame(reference)) { + if (!'CpGs' %in% colnames(reference)) { + stop("No 'CpGs' column in custom reference data.frame.") + } + sig <- reference[, -which(colnames(reference) == 'CpGs')] + rownames(sig) <- reference$CpGs + colnames(sig) <- gsub(pattern = '-', replacement = '_', x = colnames(sig)) + + warning('When a external signature matrix is provided, MethylResolver can no longer estimate tumor purity and absolute will be automatically set to FALSE.') + absolute <- FALSE + } else { + stop("reference must be either NULL (use built-in) or a data.frame") + } + if (!is.null(cpg_subset)) { overlap <- intersect(cpg_subset, rownames(sig)) missing <- setdiff(cpg_subset, rownames(sig)) @@ -48,10 +65,15 @@ run_methylresolver <- function(beta_matrix, doPar = F, numCores = 1, alpha = seq absolute = absolute, purityModel = purityModel) - result_metrics <- result_methylresolver[,1:4] - result_fractions <- result_methylresolver[,5:15] - result_absolute <- result_methylresolver[,16:26] - result_purity <- result_methylresolver[,27] + result_metrics <- result_methylresolver[,c('RMSE1', 'R1', 'RMSE2', 'R2')] + result_fractions <- result_methylresolver[,colnames(sig)] + if(absolute){ + result_absolute <- result_methylresolver[,paste0('abs_',colnames(sig))] + result_purity <- result_methylresolver[['Purity']] + }else{ + result_absolute <- NULL + result_purity <- NULL + } return(list(result_metrics=result_metrics, result_fractions=result_fractions, diff --git a/tests/testthat/test_signature_matrix.R b/tests/testthat/test_signature_matrix.R new file mode 100644 index 0000000..b2e68df --- /dev/null +++ b/tests/testthat/test_signature_matrix.R @@ -0,0 +1,69 @@ +suppressMessages(library(deconvMe)) +options(matrixStats.useNames.NA = "deprecated") + +methyl_set <- minfiData::MsetEx +ratio_set <- minfi::ratioConvert(methyl_set) +beta_matrix <- minfi::getBeta(ratio_set) + + +test_that("Houseman, EpiDISH, MethAtlas, and MethylResolver signatures have same design",{ + houseman_sig <- deconvMe::get_houseman_signature_matrix() + epidish_sig <- deconvMe::get_epidish_signature_matrix("blood") + methatlas_sig <- deconvMe::get_methatlas_signature_matrix() + methylresolver_sig <- deconvMe::get_methylresolver_signature_matrix() + + expect_true('CpGs' %in% colnames(houseman_sig)) + expect_true('CpGs' %in% colnames(epidish_sig)) + expect_true('CpGs' %in% colnames(methatlas_sig)) + expect_true('CpGs' %in% colnames(methylresolver_sig)) + + expect_true(is.data.frame(houseman_sig)) + expect_true(is.data.frame(epidish_sig)) + expect_true(is.data.frame(methatlas_sig)) + expect_true(is.data.frame(methylresolver_sig)) + + expect_true(nrow(houseman_sig) > 0) + expect_true(nrow(epidish_sig) > 0) + expect_true(nrow(methatlas_sig) > 0) + expect_true(nrow(methylresolver_sig) > 0) +}) + +test_that("run_epidish works with Houseman signature as external reference", { + houseman_sig <- deconvMe::get_houseman_signature_matrix() + res <- suppressWarnings(deconvMe::run_epidish(beta_matrix, reference = houseman_sig)) + + expect_true(is.list(res)) + expect_true("estF" %in% names(res)) + expect_true(nrow(res$estF) > 0) + expect_true(ncol(res$estF) > 0) +}) + +test_that("run_methatlas works with Houseman signature as external reference", { + houseman_sig <- deconvMe::get_houseman_signature_matrix() + res <- suppressWarnings(deconvMe::run_methatlas(beta_matrix, reference = houseman_sig)) + + expect_true(is.matrix(res) || is.data.frame(res)) + expect_true(nrow(res) > 0) + expect_true(ncol(res) > 0) +}) + +test_that("run_methylresolver works with Houseman signature as external reference", { + houseman_sig <- deconvMe::get_houseman_signature_matrix() + res <- suppressWarnings(deconvMe::run_methylresolver(beta_matrix, reference = houseman_sig, alpha = 1)) + + expect_true(is.list(res)) + expect_true("result_fractions" %in% names(res)) + expect_true(nrow(res$result_fractions) > 0) + expect_true(ncol(res$result_fractions) > 0) +}) + +test_that("run_methylresolver works with Methatlas signature as external reference", { + methatlas_sig <- deconvMe::get_methatlas_signature_matrix() + res <- suppressWarnings(deconvMe::run_methylresolver(beta_matrix, reference = methatlas_sig, alpha = 1)) + + expect_true(is.list(res)) + expect_true("result_fractions" %in% names(res)) + expect_true(nrow(res$result_fractions) > 0) + expect_true(ncol(res$result_fractions) > 0) +}) + diff --git a/vignettes/signatures.Rmd b/vignettes/signatures.Rmd index d32b614..8b4cc15 100644 --- a/vignettes/signatures.Rmd +++ b/vignettes/signatures.Rmd @@ -1,150 +1,121 @@ --- -title: "Signature Matrices and Custom CpGs in deconvMe" +title: "Signature Matrices and Cross-Signature Deconvolution in deconvMe" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Signature Matrices and Custom CpGs in deconvMe} + %\VignetteIndexEntry{Signature Matrices and Cross-Signature Deconvolution in deconvMe} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- -# Background: Reference-Based vs Reference-Free Deconvolution +# Signature Matrices in deconvMe -Cell-type deconvolution methods can be broadly categorized into two main approaches: +deconvMe implements reference-based deconvolution methods that rely on *signature matrices* (reference matrices) to estimate cell-type proportions. A signature matrix is a table where each row corresponds to a CpG site and each column corresponds to a cell type, with entries representing expected methylation values for each CpG in each pure cell type. -## Reference-Based Methods -Reference-based methods rely on *signature matrices* (also called reference matrices or basis matrices) to estimate the proportions of different cell types in a heterogeneous DNA methylation sample. A signature matrix is a table where each row corresponds to a CpG site (or region), and each column corresponds to a cell type. The entries represent the expected methylation value (often as beta values) for each CpG in each pure cell type. - -Signature matrices are typically constructed from reference datasets where DNA methylation has been measured in sorted or purified cell populations. The choice of CpGs and the quality of the reference data are critical for accurate deconvolution. - -## Reference-Free Methods -Reference-free methods attempt to deconvolve cell-type proportions without requiring prior knowledge of cell-type-specific methylation signatures. These methods use computational approaches such as non-negative matrix factorization (NMF) or independent component analysis (ICA) to identify latent cell-type components directly from the mixed methylation data. - -## Why deconvMe Focuses on Reference-Based Methods - -deconvMe exclusively implements reference-based deconvolution methods because they generally provide **better performance and more reliable results** compared to reference-free approaches. Reference-based methods offer several advantages: - -- **Higher accuracy**: Pre-defined signature matrices based on purified cell populations provide more reliable cell-type identification -- **Better interpretability**: Results can be directly mapped to known cell types with biological meaning -- **Consistent results**: The same signature matrix applied to different datasets produces comparable results -- **Validation capability**: Results can be validated against known cell-type proportions when available - -Different reference-based methods use different strategies for selecting CpGs and constructing their signature matrices: - -- **EpiDISH** uses reference matrices for blood, breast, or epithelial cell types, based on published datasets. -- **Houseman** uses optimized sets of CpGs (e.g., IDOL-optimized) for blood and other tissues. -- **MethylCC** uses differentially methylated regions (DMRs) identified from reference data. -- **MethylResolver** uses a signature matrix derived from large-scale reference data, optimized for robust deconvolution. -- **MethAtlas** uses a comprehensive reference atlas, with options for tissue-wide or immune-specific signatures. - -The accuracy of cell-type deconvolution depends heavily on how well the signature matrix represents the true methylation profiles of the cell types present in your samples. In some cases, you may want to use a custom set of CpGs (for example, to focus on a subset relevant to your study or to match the coverage of your data). - -# Accessing Signature Matrices - -deconvMe provides functions to access the signature matrices used by each deconvolution method. These matrices define the reference methylation profiles for each cell type. +## Accessing Signature Matrices ```{r, message=FALSE, warning=FALSE} library(deconvMe) +library(dplyr) +library(tidyr) +library(ggplot2) options(matrixStats.useNames.NA = "deprecated") -``` - -## EpiDISH -```{r, message=FALSE, warning=FALSE} -# Get the blood signature matrix used by EpiDISH -sig_epidish <- get_epidish_signature_matrix(reference = "blood") -head(sig_epidish) +# Load example data +library(minfiData) +methyl_set <- minfiData::MsetEx +beta_matrix <- minfi::getBeta(minfi::ratioConvert(methyl_set)) ``` -## Houseman +All signature matrices follow a unified design - a `data.frame` with a `"CpGs"` column and cell types as additional columns: -```{r, message=FALSE, warning=FALSE} -# Get the Houseman signature matrix -sig_houseman <- get_houseman_signature_matrix() -head(sig_houseman) +```{r} +# Get signature matrices +epidish_sig <- get_epidish_signature_matrix("blood") +houseman_sig <- get_houseman_signature_matrix() +methatlas_sig <- get_methatlas_signature_matrix() +methylresolver_sig <- get_methylresolver_signature_matrix() + +# Check structure +cat("EpiDISH:", dim(epidish_sig), "\n") +cat("Houseman:", dim(houseman_sig), "\n") +cat("MethAtlas:", dim(methatlas_sig), "\n") +cat("MethylResolver:", dim(methylresolver_sig), "\n") ``` -## MethylCC +## Custom CpG Subsets + +You can use custom CpG subsets for deconvolution: ```{r, message=FALSE, warning=FALSE} -# Get the DMRs (signature matrix) used by MethylCC -sig_methylcc <- get_methylcc_signature_matrix() -head(sig_methylcc) +# Get overlapping CpGs between methods +custom_cpgs <- intersect(epidish_sig$CpGs, houseman_sig$CpGs) +cat("Overlapping CpGs:", length(custom_cpgs), "\n") + +# Run with custom CpGs +result_custom <- run_epidish(beta_matrix, cpg_subset = custom_cpgs) ``` -## MethylResolver +## Cross-Signature Deconvolution -```{r, message=FALSE, warning=FALSE} -# Get the signature matrix used by MethylResolver -sig_methylresolver <- get_methylresolver_signature_matrix() -head(sig_methylresolver) -``` +deconvMe supports **cross-signature deconvolution**, allowing you to use signature matrices from one method with the deconvolution algorithm of another method. -## MethAtlas +### Supported Methods -```{r, message=FALSE, warning=FALSE} -# Get the default reference matrix used by MethAtlas -sig_methatlas <- get_methatlas_signature_matrix() -head(sig_methatlas) -``` +| Method | Cross-Signature Support | +|--------|------------------------| +| EpiDISH | ✅ All methods | +| MethAtlas | ✅ All methods | +| MethylResolver | ✅ All methods | +| Houseman | ❌ Not supported | +| MethylCC | ❌ Not supported (no classical signature matrix) | -## Prepare Example Data +### Examples ```{r, message=FALSE, warning=FALSE} -library(minfiData) -methyl_set <- minfiData::MsetEx -beta_matrix <- minfi::getBeta(minfi::ratioConvert(methyl_set)) +# EpiDISH with Houseman signature +result_epidish_houseman <- run_epidish(beta_matrix, reference = houseman_sig) + +# MethAtlas with EpiDISH signature +result_methatlas_epidish <- run_methatlas(beta_matrix, reference = epidish_sig) + +# MethylResolver with MethAtlas signature +result_mr_methatlas <- run_methylresolver( + beta_matrix, + reference = methatlas_sig, + alpha = 1 +) ``` -# Example: Using Custom CpGs from EpiDISH and Houseman signatures +### Comparing Results -When using custom CpGs for deconvolution, it is important to ensure that the CpGs you select are present in the signature matrix for the method you are using. Here, we demonstrate this feature for EpiDISH and Houseman, as their signature matrices may have some overlap. For other methods, custom CpG support is available, but the overlap with EpiDISH/Houseman signatures may be zero, so we do not show those examples here. +```{r, fig.height=4, fig.width=8, message=FALSE, warning=FALSE} +# Compare EpiDISH with different signatures +result_epidish_blood <- run_epidish(beta_matrix, reference = "blood")$estF +result_epidish_houseman <- run_epidish(beta_matrix, reference = houseman_sig)$estF -```{r} -# Get CpGs from each method's signature matrix -cpgs_epidish <- sig_epidish$CpGs -cpgs_houseman <- sig_houseman$CpGs +# Rename cell types to unified names +colnames(result_epidish_blood) <- deconvMe::rename_cell_types(colnames(result_epidish_blood)) +colnames(result_epidish_houseman) <- deconvMe::rename_cell_types(colnames(result_epidish_houseman)) -# Example: intersection of CpGs between EpiDISH and Houseman -custom_cpgs <- intersect(cpgs_epidish, cpgs_houseman) -length(custom_cpgs) -``` +result_epidish_blood_long <- result_epidish_blood |> + as.data.frame() |> + tibble::rownames_to_column('sample') |> + tidyr::pivot_longer(-sample, names_to = 'celltype') |> + dplyr::mutate('method'='EpiDISH') -## Example: Running EpiDISH with Custom CpGs +result_epidish_houseman_long <- result_epidish_houseman |> + as.data.frame() |> + tibble::rownames_to_column('sample') |> + tidyr::pivot_longer(-sample, names_to = 'celltype') |> + dplyr::mutate('method'='Houseman') -```{r, message=FALSE, warning=FALSE} -# Use the method-specific subset -result_custom_epidish <- run_epidish(beta_matrix, cpg_subset = custom_cpgs) -``` +results <- dplyr::bind_rows(result_epidish_blood_long, result_epidish_houseman_long) -## Comparing Results: Custom CpGs vs Full Signature +deconvMe::results_aggregated_boxplot(results) + + ggplot2::labs(fill = "signature")+ + ggtitle('Deconvolution with EpiDISH, with two different signatures') -You can compare the deconvolution results obtained using the custom CpGs to those obtained using the full EpiDISH signature matrix: - -```{r, fig.height=4, fig.width=8, message=FALSE, warning=FALSE} -# Run EpiDISH with the full signature -result_full_epidish <- run_epidish(beta_matrix) - - -# Optionally, visualize the differences (e.g., for the first sample) using tidyverse -if (requireNamespace("tidyr", quietly = TRUE) && requireNamespace("dplyr", quietly = TRUE) && requireNamespace("ggplot2", quietly = TRUE)) { - library(dplyr) - library(tidyr) - library(ggplot2) - df_compare <- tibble( - CellType = colnames(result_full_epidish$estF), - Full = as.numeric(result_full_epidish$estF[1,]), - Custom = as.numeric(result_custom_epidish$estF[1, colnames(result_custom_epidish$estF)]) - ) %>% - pivot_longer(cols = c(Full, Custom), names_to = "Signature", values_to = "Fraction") - - ggplot(df_compare, aes(x = CellType, y = Fraction, fill = Signature)) + - geom_bar(stat = "identity", position = "dodge") + - labs(title = "EpiDISH: Full vs Custom CpGs (Sample 1)", y = "Estimated Fraction") + - scale_fill_manual(values = c("Full" = "bisque", "Custom" = "darkgreen"))+ - theme_minimal() -} ``` ---- -For more details on the available arguments and customization options, see the function documentation or the source code in the `R/` directory. +