Conversation
TuomasBorman
left a comment
There was a problem hiding this comment.
Some suggestions. Code looks already simple and good, just like it should be.
| setMethod( | ||
| "getTtest", "SummarizedExperiment", | ||
| function (x, assay.type = NULL, row.var = NULL, col.var = NULL, | ||
| formula, split.by = NULL, pair.by = NULL, features = NULL, | ||
| var.equal = FALSE, p.adjust.method = "fdr", ... ){ | ||
| ############################# Input check ############################## | ||
| group <- .check_input( |
There was a problem hiding this comment.
Run:
styler::style_file(path = "inst/pages/machine_learning.qmd", transformers = styler::tidyverse_style(indent_by = 4L))
|
@0xMuluh Any updates on this? |
TuomasBorman
left a comment
There was a problem hiding this comment.
Thanks!
I tend to think this is a little bit too complex, especially when we think about future extensions.
I would start with minimal working functionality first. Then we can extend the functions if needed, but already those minimal functionality should be sufficient for most of the problems.
The methods below have been on our list to add. Some of them require data in long format, while some need data in wide format.
It might be easiest to first create a get* function and minimal documentation for all these methods. Once these are done, we can check how to harmonize their output.
Here is some code that I tested:
# Prepare data
library(mia)
data("peerj13075")
tse <- peerj13075
tse <- addAlpha(tse, index = "shannon")
tse <- agglomerateByRank(tse, "phylum")
tse <- transformAssay(tse, method = "pa")
tse <- transformAssay(tse, method = "rclr")
The following methods use long-formatted data
getTTest(tse, counts ~ Geographical_location)
getWilcoxonTest(tse, counts ~ Geographical_location)
getAnova(tse, counts ~ Geographical_location)
getKruskal(tse, counts ~ Geographical_location)
getBrms(tse, rclr ~ Geographical_location + (1|rownames))
Here are their "generic" functions
# Run a t-test per feature. The formula must be two-sided with exactly one
# grouping variable on the right-hand side, e.g. counts ~ group. 'pair.by'
# names a column identifying sample pairs for a paired test; '...' is
# forwarded to rstatix::t_test.
getTTest <- function(tse, formula, pair.by = NULL, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop("'formula' must have exactly one grouping variable on its ",
            "right-hand side, e.g. counts ~ group.", call. = FALSE)
    }
    df <- .get_long_data(tse, formula, pair.by)
    res <- .calculate_rstatix(
        df = df, formula = formula, pair.by = pair.by,
        FUN = rstatix::t_test, ...)
    return(res)
}
# Run a Wilcoxon rank-sum test per feature. The formula must be two-sided
# with exactly one grouping variable on the right-hand side. 'pair.by' names
# a column identifying sample pairs for a paired (signed-rank) test; '...'
# is forwarded to rstatix::wilcox_test.
getWilcoxonTest <- function(tse, formula, pair.by = NULL, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop("'formula' must have exactly one grouping variable on its ",
            "right-hand side, e.g. counts ~ group.", call. = FALSE)
    }
    df <- .get_long_data(tse, formula, pair.by)
    res <- .calculate_rstatix(
        df = df, formula = formula, pair.by = pair.by,
        FUN = rstatix::wilcox_test, ...)
    return(res)
}
# Run a one-way ANOVA per feature. The formula must be two-sided with
# exactly one grouping variable on the right-hand side; '...' is forwarded
# to rstatix::anova_test.
getAnova <- function(tse, formula, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop("'formula' must have exactly one grouping variable on its ",
            "right-hand side, e.g. counts ~ group.", call. = FALSE)
    }
    df <- .get_long_data(tse, formula)
    res <- .calculate_rstatix(
        df = df, formula = formula, FUN = rstatix::anova_test, ...)
    return(res)
}
# Run a Kruskal-Wallis test per feature. The formula must be two-sided with
# exactly one grouping variable on the right-hand side; '...' is forwarded
# to rstatix::kruskal_test.
getKruskal <- function(tse, formula, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop("'formula' must have exactly one grouping variable on its ",
            "right-hand side, e.g. counts ~ group.", call. = FALSE)
    }
    df <- .get_long_data(tse, formula)
    res <- .calculate_rstatix(
        df = df, formula = formula, FUN = rstatix::kruskal_test, ...)
    return(res)
}
# Most of the modeling frameworks also use long-format so this would be easy extension. BUT probably we would not want to add these now.
# Fit a Bayesian regression model with brms on long-format data. '...' is
# forwarded to brms::brm (e.g. family, chains, iter).
getBrms <- function(tse, formula, ...){
    # Avoid library() inside a function; fail early with a clear message.
    if( !requireNamespace("brms", quietly = TRUE) ){
        stop("Package 'brms' is required but not installed.", call. = FALSE)
    }
    df <- .get_long_data(tse, formula)
    res <- brms::brm(formula = formula, data = df, ...)
    return(res)
}
and here are the helper functions to wrangle the data and run the tests.
# Build a long-format data.frame from a (Tree)SummarizedExperiment for
# rstatix-style tests.
#
# The dependent variable (formula LHS) may name an assay, a colData column
# or a rowData column. Assays are melted so that each row is one
# feature/sample observation; metadata columns are used as-is. The returned
# data.frame always carries "colnames" and "rownames" key columns.
.get_long_data <- function(tse, formula, pair.by = NULL){
    lhs <- .get_lhs(formula)
    # Resolve where the dependent variable lives.
    is_assay <- lhs %in% assayNames(tse)
    is_coldata <- lhs %in% colnames(colData(tse))
    is_rowdata <- lhs %in% colnames(rowData(tse))
    # NOTE(review): 'is_rowdata' is computed but never referenced; the else
    # branch falls back to rowData whenever the variable is not in colData.
    if( is_assay ){
        # Melt assay + rowData + colData into one long table.
        df <- .merge_tse(tse, lhs)
    } else{
        FUN <- if (is_coldata) colData else rowData
        df <- tse |> FUN()
        # NOTE(review): in the rowData branch this stores feature names under
        # "colnames" and the variable name under "rownames" -- confirm that
        # downstream grouping by "rownames" is the intended behavior here.
        df[["colnames"]] <- rownames(df)
        df[["rownames"]] <- lhs
    }
    # The statistical tests downstream assume a numeric response.
    if (!is.numeric(df[[lhs]])) {
        stop("The dependent variable must be numeric.", call. = FALSE)
    }
    rhs <- .get_rhs(formula)
    all_vars <- c(lhs, rhs, pair.by)
    # Check that vars exist
    if( !all(all_vars %in% colnames(df)) ){
        stop(".")
    }
    # Keep only the variables used by the test plus the key columns.
    df <- df[, c(all_vars, "colnames", "rownames"), drop = FALSE] |>
        as.data.frame()
    return(df)
}
# Melt a SummarizedExperiment into one long data.frame: the chosen assay is
# combined with rowData and colData so that each row is one feature/sample
# observation, with "rownames" (feature id) and "colnames" (sample id) keys.
.merge_tse <- function(tse, assay.type){
    # Use namespaced calls instead of library(): attaching packages inside a
    # function is a global side effect. Fail early with a clear message.
    for( pkg in c("dplyr", "tidyr", "tibble") ){
        if( !requireNamespace(pkg, quietly = TRUE) ){
            stop("Package '", pkg, "' is required but not installed.",
                call. = FALSE)
        }
    }
    rd <- tse |> rowData()
    cd <- tse |> colData()
    mat <- tse |> assay(assay.type)
    # Wide feature x sample matrix --> long format, with abundance values in
    # a column named after the assay.
    mat <- mat |>
        as.data.frame() |>
        tibble::rownames_to_column("rownames") |>
        tidyr::pivot_longer(
            cols = -rownames,
            names_to = "colnames",
            values_to = assay.type
        )
    rd <- rd |>
        as.data.frame() |>
        tibble::rownames_to_column("rownames")
    cd <- cd |>
        as.data.frame() |>
        tibble::rownames_to_column("colnames")
    # Attach feature and sample metadata to every observation.
    df <- mat |>
        dplyr::left_join(rd, by = "rownames") |>
        dplyr::left_join(cd, by = "colnames")
    return(df)
}
# Extract the name of the dependent variable (left-hand side) of a formula.
# Validates that the formula is two-sided: without the check, a one-sided
# formula such as '~ group' would silently return its first RHS variable.
.get_lhs <- function(formula){
    if( length(formula) != 3L ){
        stop("'formula' must be two-sided, e.g. 'counts ~ group'.",
            call. = FALSE)
    }
    # formula[[2L]] is the LHS expression; take its first variable so that
    # e.g. 'log(y) ~ x' still yields "y".
    lhs <- all.vars(formula[[2L]])[[1L]]
    return(lhs)
}
# Extract the variable names on the right-hand side of a formula. Reading
# from the RHS element directly (rather than dropping the first element of
# all.vars()) stays correct for one-sided formulas and for LHS expressions
# that mention several variables.
.get_rhs <- function(formula){
    rhs <- all.vars(formula[[length(formula)]])
    return(rhs)
}
# Run an rstatix test on long-format data.
#
# @param df Long-format data.frame from .get_long_data().
# @param formula Two-sided formula, e.g. counts ~ group.
# @param FUN An rstatix test function, e.g. rstatix::t_test.
# @param pair.by Optional character; column identifying sample pairs.
# @param ... Forwarded to FUN.
.calculate_rstatix <- function(df, formula, FUN, pair.by = NULL, ...) {
    lhs <- .get_lhs(formula) |> rlang::sym()
    rhs <- .get_rhs(formula) |> rlang::sym()
    # Feature filtering, there must be variance. Otherwise test will fail
    df <- df |>
        dplyr::group_by(rownames, !!rhs) |>
        dplyr::filter(
            dplyr::n_distinct(!!lhs) > 1,
            var(!!lhs, na.rm = TRUE) > 0
        ) |>
        dplyr::ungroup()
    # Each feature must still have at least two groups to compare. The data
    # intentionally stays grouped by feature ("rownames") here.
    df <- df |>
        dplyr::group_by(rownames) |>
        dplyr::filter(
            dplyr::n_distinct(!!rhs) >= 2
        )
    # Paired samples are controlled in rstatix with sorting
    if (!is.null(pair.by)) {
        # 'pair.by' is a character column name: '{{ pair.by }}' would sort by
        # the constant string itself (a no-op), so index via the .data pronoun.
        df <- df |> dplyr::arrange(.data[[pair.by]])
    }
    # Create an argument list
    args <- list(
        data = df,
        formula = formula
    )
    if (!is.null(pair.by)) {
        args[["paired"]] <- TRUE
    }
    args <- c(args, list(...))
    # Call function
    res <- do.call(FUN, args)
    return(res)
}
Some of the functions require the data in two separate matrices: one for the abundance table and one for the sample metadata. For instance, the limma workflow is like that. For the other methods below, we could use the same data format.
getLimma(tse, counts ~ Geographical_location)
getLm(tse, rclr ~ Geographical_location)
getLm(tse, shannon ~ Geographical_location)
getOrm(tse, counts ~ Geographical_location)
getFirth(tse, pa ~ Geographical_location + Age)
# Run the limma workflow on wide-format data (abundance matrix + sample
# metadata handled separately).
# NOTE(review): '...' is currently accepted but not forwarded anywhere.
getLimma <- function(tse, formula, ...){
    wide <- .get_wide_data(tse, formula)
    res <- .calculate_limma(
        formula = formula,
        mat = wide[["matrix"]],
        metadata = wide[["sample_metadata"]]
    )
    return(res)
}
# Fit a linear model per feature on wide-format data.
# NOTE(review): '...' is currently accepted but not forwarded anywhere.
getLm <- function(tse, formula, ...){
    wide <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(
        formula = formula,
        mat = wide[["matrix"]],
        metadata = wide[["sample_metadata"]],
        FUN = .run_lm
    )
    return(res)
}
# Fit an ordinal regression model per feature on wide-format data.
# NOTE(review): '...' is currently accepted but not forwarded anywhere.
getOrm <- function(tse, formula, ...){
    wide <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(
        formula = formula,
        mat = wide[["matrix"]],
        metadata = wide[["sample_metadata"]],
        FUN = .run_orm
    )
    return(res)
}
# Fit a Firth logistic regression model per feature on wide-format data.
# NOTE(review): '...' is currently accepted but not forwarded anywhere.
getFirth <- function(tse, formula, ...){
    wide <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(
        formula = formula,
        mat = wide[["matrix"]],
        metadata = wide[["sample_metadata"]],
        FUN = .run_firth
    )
    return(res)
}
Here are the helper functions to wrangle the data and run the tests
# Split a SummarizedExperiment into a wide abundance table and a sample
# metadata table, as required by limma-style workflows.
#
# The formula LHS may name an assay or a colData column; the RHS variables
# must be colData columns. Returns a list with elements "matrix" (features
# x samples data.frame) and "sample_metadata" (samples x variables).
.get_wide_data <- function(tse, formula){
    lhs <- .get_lhs(formula)
    rhs <- .get_rhs(formula)
    # All variables must be resolvable as assays or sample metadata.
    if( !all(c(lhs, rhs) %in% c(assayNames(tse), colnames(colData(tse))) )){
        stop(".")
    }
    is_assay <- lhs %in% assayNames(tse)
    if( is_assay ){
        lhs_df <- assay(tse, lhs)
    } else{
        # A single metadata column is transposed to a 1 x n_samples matrix,
        # presumably to mimic the features x samples orientation of an
        # assay -- TODO confirm downstream code relies on this orientation.
        lhs_df <- colData(tse)[, lhs, drop = FALSE] |> as.data.frame() |> t()
    }
    lhs_df <- lhs_df |> as.data.frame()
    rhs_df <- colData(tse)[, rhs, drop = FALSE] |> as.data.frame()
    # The modeling functions downstream require numeric abundances.
    if( !all(vapply(lhs_df, is.numeric, logical(1L)))){
        stop(".")
    }
    data_list <- list(
        matrix = lhs_df,
        sample_metadata = rhs_df
    )
    return(data_list)
}
# Fit the limma linear-model workflow.
#
# @param formula Two-sided formula; only its RHS is used for the design.
# @param mat Features x samples abundance table.
# @param metadata Sample metadata holding the RHS variables.
# @return An eBayes-moderated limma fit object.
.calculate_limma <- function(formula, mat, metadata){
    # Use namespaced calls instead of library() to avoid attaching the
    # package as a side effect; fail early with a clear message.
    if( !requireNamespace("limma", quietly = TRUE) ){
        stop("Package 'limma' is required but not installed.", call. = FALSE)
    }
    rhs_formula <- .get_rhs_formula(formula)
    design_mat <- model.matrix(rhs_formula, data = metadata)
    fit <- limma::lmFit(mat, design_mat)
    # Empirical Bayes moderation of the variance estimates
    fit <- limma::eBayes(fit)
    return(fit)
}
# Drop the response from a formula, keeping only its right-hand side
# (e.g. 'y ~ a + b' becomes '~ a + b').
.get_rhs_formula <- function(formula){
    trms <- delete.response(terms(formula))
    return(as.formula(trms))
}
# Fit a model separately for each feature and bind the results into one
# data.frame with BH-adjusted p-values.
#
# @param formula Two-sided formula; RHS variables come from 'metadata'.
# @param mat Features x samples abundance table.
# @param metadata Sample metadata (samples x variables).
# @param FUN Per-feature fitter with signature (abundance, metadata,
#   formula) returning a data.frame with a 'p_value' column. The default is
#   .run_lm; the previous default 'lm' did not match this calling
#   convention and could not work.
.train_model_per_feature <- function(formula, mat, metadata, FUN = .run_lm){
    # Namespaced calls instead of library() to avoid attach side effects.
    if( !requireNamespace("purrr", quietly = TRUE) ){
        stop("Package 'purrr' is required but not installed.", call. = FALSE)
    }
    mat |> # Features x samples abundance table
        t() |> # Transpose so each column holds one feature's abundances
        as.data.frame() |>
        purrr::map(
            ~ FUN( # Fit the model for one feature at a time
                .,
                metadata = metadata,
                formula = formula
            )
        ) |>
        dplyr::bind_rows(.id = "rownames") |> # Bind the results of all features
        dplyr::mutate(
            # Multiple-testing correction across features
            q_value = stats::p.adjust(p_value, method = "BH")
        )
}
# Fit an ordinary linear model for one feature.
#
# @param abundance Numeric vector of abundances for one feature.
# @param metadata Sample metadata used to build the design matrix.
# @param formula Two-sided formula; only its RHS is used here.
# @return data.frame with estimate, std_error, statistic, p_value and
#   variable columns; the intercept row is removed.
.run_lm <- function(abundance, metadata, formula) {
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)
    # Fit a linear model (previous comment wrongly said "Firth logistic")
    fit <- lm(abundance ~ .,
        data = mm
    )
    # Tidy to one row per coefficient; harmonize column names to snake_case
    res <- broom::tidy(fit) |> tibble::column_to_rownames("term")
    colnames(res) <- gsub("\\.", "_", colnames(res))
    res <- res[!rownames(res) %in% c("(Intercept)"), , drop = FALSE]
    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL
    return(res)
}
# Fit an ordinal regression model (rms::orm) for one feature and compute a
# score-test p-value for each RHS variable.
#
# @param abundance Numeric vector of abundances for one feature.
# @param metadata Sample metadata used to build the design matrix.
# @param formula Two-sided formula; only its RHS is used here.
# @return data.frame with estimate, p_value and variable columns.
.run_orm <- function(abundance, metadata, formula) {
    # Namespaced calls instead of library() to avoid attach side effects.
    if( !requireNamespace("rms", quietly = TRUE) ){
        stop("Package 'rms' is required but not installed.", call. = FALSE)
    }
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)
    # seq_len() is safe when the design matrix has no columns (1:0 is not)
    inds <- seq_len(ncol(mm) - 1L)
    vars <- colnames(mm)[inds]
    # Fit ordinal regression model with all variables
    # (Intercepts are automatically added)
    fit_1 <- rms::orm(abundance ~ ., data = mm)
    score_1 <- fit_1$stats["Score"]
    res <- data.frame(estimate = fit_1$coefficients[vars], p_value = NA)
    # Calculate p-value based on score test for each variable
    if (length(inds) == 1) {
        res$p_value <- fit_1$stats["Score P"]
    } else {
        for (i in inds) {
            # Refit without variable i; the difference of score statistics
            # is compared to a chi-squared distribution with 1 df
            fit_0 <- rms::orm(abundance ~ ., data = mm[, -i])
            score_0 <- fit_0$stats["Score"]
            res$p_value[i] <- as.numeric(1 - pchisq(score_1 - score_0, df = 1))
        }
    }
    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL
    return(res)
}
# Fit a Firth-penalized logistic regression (logistf) for one feature.
#
# @param abundance Binary (presence/absence) vector for one feature.
# @param metadata Sample metadata used to build the design matrix.
# @param formula Two-sided formula; only its RHS is used here.
# @return data.frame with estimate, p_value and variable columns; the
#   intercept row is removed.
.run_firth <- function(abundance, metadata, formula) {
    # Namespaced calls instead of library() to avoid attach side effects.
    if( !requireNamespace("logistf", quietly = TRUE) ){
        stop("Package 'logistf' is required but not installed.",
            call. = FALSE)
    }
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)
    # Fit Firth logistic regression model; raise the iteration cap for
    # separation-prone presence/absence data
    fit <- logistf::logistf(
        abundance ~ .,
        data = mm,
        control = logistf::logistf.control(maxit = 1000)
    )
    res <- data.frame(
        estimate = fit$coefficients,
        p_value = fit$prob
    )
    res <- res[!rownames(res) %in% c("(Intercept)"), , drop = FALSE]
    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL
    return(res)
}
# Build a design matrix from the RHS of a formula, with the intercept
# column removed so only predictor columns remain.
.create_design_matrix <- function(formula, metadata){
    rhs_only <- .get_rhs_formula(formula)
    design <- as.data.frame(model.matrix(rhs_only, metadata))
    design[["(Intercept)"]] <- NULL
    return(design)
}
| Depends: | ||
| R (>= 4.5.0) | ||
| R (>= 4.5.0), | ||
| mia |
There was a problem hiding this comment.
mia could be dropped if possible to avoid dependencies.
Add SCE and SE as "Depends"
| #' @importFrom rstatix t_test adjust_pvalue | ||
| #' @importFrom S4Vectors DataFrame |
There was a problem hiding this comment.
Add these importFrom above the function that is using the package --> easier to keep track of them
| #' keep <- apply(assay_mat, 1, function(v) { | ||
| #' a <- v[grp == "Feces"] | ||
| #' b <- v[grp == "Skin"] | ||
| #' !(stats::var(a, na.rm = TRUE) == 0 && stats::var(b, na.rm = TRUE) == 0) | ||
| #' }) | ||
| #' tse <- tse[keep, ] |
There was a problem hiding this comment.
This is too complex for an example
| #' @param split.by \code{Character vector} or \code{NULL}. Columns to split by. | ||
| #' Tests are run separately for each combination. (Default: \code{NULL}) |
There was a problem hiding this comment.
I guess this is more rarely used. This could be "hidden" parameter under .... Less options is better for usability.
| if (!.is_non_empty_string(name)) { | ||
| stop("'name' must be a single character value.", call. = FALSE) | ||
| } |
There was a problem hiding this comment.
You could use .check_input as it harmonizes the input test and messages
| all_vars <- c(group, split.by, pair.by) | ||
| all_vars <- unique(all_vars[!sapply(all_vars, is.null)]) |
There was a problem hiding this comment.
If element of vector is NULL, it is removed. For instance, there is no such thing as
c(1, 2, NULL). It is c(1, 2) instead.
So is this line necessary (I have not checked in detail)?
| } else { | ||
| # Keep as plain data.frame for single test (no FeatureID/split.by) | ||
| df <- df | ||
| } |
| stop("'formula' must be a formula.", call. = FALSE) | ||
| } | ||
| # Get RHS of formula (handles both ~ group and value ~ group) | ||
| rhs <- as.character(formula)[length(as.character(formula))] |
No description provided.