add tests #10

Open
0xMuluh wants to merge 8 commits into microbiome:devel from 0xMuluh:tests

Conversation

0xMuluh commented Jan 3, 2026

No description provided.

TuomasBorman (Contributor) left a comment

Some suggestions. The code already looks simple and good, just as it should.

Comment on lines +90 to +96
setMethod(
"getTtest", "SummarizedExperiment",
function (x, assay.type = NULL, row.var = NULL, col.var = NULL,
formula, split.by = NULL, pair.by = NULL, features = NULL,
var.equal = FALSE, p.adjust.method = "fdr", ... ){
############################# Input check ##############################
group <- .check_input(

Run:

styler::style_file(path = "inst/pages/machine_learning.qmd", transformers = styler::tidyverse_style(indent_by = 4L))

TuomasBorman (Contributor) commented:

@0xMuluh Any updates on this?

0xMuluh requested a review from TuomasBorman on March 23, 2026 07:25
TuomasBorman (Contributor) left a comment

Thanks!

I tend to think this is a little too complex, especially when we think about future extensions.

I would start with minimal working functionality first. Then we can extend the functions if needed, but that minimal functionality should already be sufficient for most problems.

The methods below have been on our list to add. Some of them require data in long format while others need data in wide format.

It might be easiest to first create a get* function and minimal documentation for all these methods. Once those are done, we can check how to harmonize their output.

Here is some code that I tested:

# Prepare data
library(mia)
data("peerj13075")

tse <- peerj13075

tse <- addAlpha(tse, index = "shannon")
tse <- agglomerateByRank(tse, "phylum")
tse <- transformAssay(tse, method = "pa")
tse <- transformAssay(tse, method = "rclr")

The following methods use long-formatted data:

getTTest(tse, counts ~ Geographical_location)
getWilcoxonTest(tse, counts ~ Geographical_location)
getAnova(tse, counts ~ Geographical_location)
getKruskal(tse, counts ~ Geographical_location)
getBrms(tse, rclr ~ Geographical_location + (1|rownames))
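For clarity, "long format" here means one row per (feature, sample) pair. A minimal base-R sketch of the wide-to-long reshape that `.get_long_data` performs, using a toy matrix rather than the peerj13075 data (illustration only, not from the PR):

```r
# Toy wide matrix: features as rows, samples as columns.
mat <- matrix(1:6, nrow = 2, dimnames = list(c("f1", "f2"), c("s1", "s2", "s3")))

# Long format: one row per (feature, sample) pair. as.vector() walks the
# matrix column by column, so rownames repeat per column and colnames per row.
long <- data.frame(
    rownames = rep(rownames(mat), times = ncol(mat)),
    colnames = rep(colnames(mat), each = nrow(mat)),
    counts = as.vector(mat)
)
```

This is what tidyr's pivot_longer produces in `.merge_tse`, before the rowData and colData columns are joined on.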

Here are their "generic" functions:

getTTest <- function(tse, formula, pair.by = NULL, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop(".")
    }
    df <- .get_long_data(tse, formula, pair.by)
    res <- .calculate_rstatix(df = df, formula = formula, pair.by = pair.by, FUN = rstatix::t_test, ...)
    return(res)
}

getWilcoxonTest <- function(tse, formula, pair.by = NULL, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop(".")
    }
    df <- .get_long_data(tse, formula, pair.by)
    res <- .calculate_rstatix(df = df, formula = formula, pair.by = pair.by, FUN = rstatix::wilcox_test, ...)
    return(res)
}
getAnova <- function(tse, formula, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop(".")
    }
    df <- .get_long_data(tse, formula)
    res <- .calculate_rstatix(df = df, formula = formula, FUN = rstatix::anova_test, ...)
    return(res)
}
getKruskal <- function(tse, formula, ...){
    if( length(all.vars(formula[[3]])) != 1L ){
        stop(".")
    }
    df <- .get_long_data(tse, formula)
    res <- .calculate_rstatix(df = df, formula = formula, FUN = rstatix::kruskal_test, ...)
    return(res)
}
# Most modeling frameworks also use long format, so this would be an easy extension. BUT we probably would not want to add these now.
getBrms <- function(tse, formula, ...){
    library(brms)
    df <- .get_long_data(tse, formula)
    res <- brm(formula = formula, data = df)
    return(res)
}

And here are the helper functions to wrangle the data and run the tests:

.get_long_data <- function(tse, formula, pair.by = NULL){
    lhs <- .get_lhs(formula)

    is_assay <- lhs %in% assayNames(tse)
    is_coldata <- lhs %in% colnames(colData(tse))
    is_rowdata <- lhs %in% colnames(rowData(tse))

    if( is_assay ){
        df <- .merge_tse(tse, lhs)
    } else{
        FUN <- if (is_coldata) colData else rowData
        df <- tse |> FUN()
        df[["colnames"]] <- rownames(df)
        df[["rownames"]] <- lhs
    }

    if (!is.numeric(df[[lhs]])) {
        stop("The dependent variable must be numeric.", call. = FALSE)
    }
    rhs <- .get_rhs(formula)
    all_vars <- c(lhs, rhs, pair.by)

    # Check that vars exist
    if( !all(all_vars %in% colnames(df)) ){
        stop(".")
    }
    df <- df[, c(all_vars, "colnames", "rownames"), drop = FALSE] |>
        as.data.frame()
    return(df)
}

.merge_tse <- function(tse, assay.type){

    library(dplyr)
    library(tidyr)
    library(tibble)

    rd <- tse |> rowData()
    cd <- tse |> colData()
    mat <- tse |> assay(assay.type)

    mat <- mat |>
        as.data.frame() |>
        rownames_to_column("rownames") |>
        pivot_longer(
            cols = -rownames,
            names_to = "colnames",
            values_to = assay.type
        )

    rd <- rd |>
        as.data.frame() |>
        rownames_to_column("rownames")

    cd <- cd |>
        as.data.frame() |>
        rownames_to_column("colnames")

    df <- mat |>
        dplyr::left_join(rd, by = "rownames") |>
        dplyr::left_join(cd, by = "colnames")

    return(df)
}

.get_lhs <- function(formula){
    all_vars <- formula |> all.vars()
    lhs <- all_vars[[1L]]
    return(lhs)
}

.get_rhs <- function(formula){
    all_vars <- formula |> all.vars()
    rhs <- all_vars[-1]
    return(rhs)
}

.calculate_rstatix <- function(df, formula, FUN, pair.by = NULL, ...) {

    lhs <- .get_lhs(formula) |> rlang::sym()
    rhs <- .get_rhs(formula) |> rlang::sym()

    # Feature filtering, there must be variance. Otherwise test will fail
    df <- df |>
        dplyr::group_by(rownames, !!rhs) |>
        dplyr::filter(
            dplyr::n_distinct(!!lhs) > 1,
            var(!!lhs, na.rm = TRUE) > 0
        ) |>
        dplyr::ungroup()
    df <- df |>
        dplyr::group_by(rownames) |>
        dplyr::filter(
            dplyr::n_distinct(!!rhs) >= 2
        )

    # Paired samples are controlled in rstatix with sorting
    if (!is.null(pair.by)) {
        # pair.by is a character column name, so use all_of() rather than
        # embracing ({{ }}) a string, which would sort by a constant
        df <- df |> dplyr::arrange(dplyr::across(dplyr::all_of(pair.by)))
    }

    # Create an argument list
    args <- list(
        data = df,
        formula = formula
    )
    if (!is.null(pair.by)) {
        args[["paired"]] <- TRUE
    }
    args <- c(args, list(...))

    # Call function
    res <- do.call(FUN, args)

    return(res)
}

Some of the functions require the data in two separate matrices: one for the abundance table and one for the sample metadata. For instance, the limma workflow is like that. For the other methods below, we could use the same data format.

getLimma(tse, counts ~ Geographical_location)
getLm(tse, rclr ~ Geographical_location)
getLm(tse, shannon ~ Geographical_location)
getOrm(tse, counts ~ Geographical_location)
getFirth(tse, pa ~ Geographical_location + Age)

getLimma <- function(tse, formula, ...){
    data_list <- .get_wide_data(tse, formula)
    res <- .calculate_limma(formula = formula, mat = data_list[["matrix"]], metadata = data_list[["sample_metadata"]])
    return(res)
}

getLm <- function(tse, formula, ...){
    data_list <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(formula = formula, mat = data_list[["matrix"]], metadata = data_list[["sample_metadata"]], FUN = .run_lm)
    return(res)
}

getOrm <- function(tse, formula, ...){
    data_list <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(formula = formula, mat = data_list[["matrix"]], metadata = data_list[["sample_metadata"]], FUN = .run_orm)
    return(res)
}

getFirth <- function(tse, formula, ...){
    data_list <- .get_wide_data(tse, formula)
    res <- .train_model_per_feature(formula = formula, mat = data_list[["matrix"]], metadata = data_list[["sample_metadata"]], FUN = .run_firth)
    return(res)
}

Here are the helper functions to wrangle the data and run the tests:

.get_wide_data <- function(tse, formula){
    lhs <- .get_lhs(formula)
    rhs <- .get_rhs(formula)

    if( !all(c(lhs, rhs) %in% c(assayNames(tse), colnames(colData(tse))) )){
        stop(".")
    }

    is_assay <- lhs %in% assayNames(tse)
    if( is_assay ){
        lhs_df <- assay(tse, lhs)
    } else{
        lhs_df <- colData(tse)[, lhs, drop = FALSE] |> as.data.frame() |> t()
    }
    lhs_df <- lhs_df |> as.data.frame()

    rhs_df <- colData(tse)[, rhs, drop = FALSE] |> as.data.frame()

    if( !all(vapply(lhs_df, is.numeric, logical(1L)))){
        stop(".")
    }
    data_list <- list(
        matrix = lhs_df,
        sample_metadata = rhs_df
    )
    return(data_list)
}

.calculate_limma <- function(formula, mat, metadata){
    library(limma)

    rhs_formula <- .get_rhs_formula(formula)
    design_mat <-  model.matrix(rhs_formula, data = metadata)

    fit <- lmFit(mat, design_mat)
    fit <- eBayes(fit)
    return(fit)
}

.get_rhs_formula <- function(formula){
    rhs_formula <- terms(formula) |>
        delete.response() |>
        as.formula()
    return(rhs_formula)
}
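As a quick sanity check of the response-stripping step (base R only; the function below mirrors the helper above):

```r
# Mirrors .get_rhs_formula: strip the response from a two-sided formula,
# leaving a one-sided formula suitable for model.matrix().
rhs_only <- function(formula) {
    as.formula(delete.response(terms(formula)))
}

f <- counts ~ Geographical_location + Age
rhs_only(f)  # ~Geographical_location + Age
```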

.train_model_per_feature <- function(formula, mat, metadata, FUN = lm){
    library(purrr)
    library(dplyr)
    mat |>
        t() |> # Transpose so that each column holds one feature
        as.data.frame() |>
        map(
            ~ FUN( # Fit the model for each feature
                .,
                metadata = metadata,
                formula = formula
            )
        ) |>
        bind_rows(.id = "rownames") |> # Bind the results of all features
        mutate(
            q_value = p.adjust(p_value, method = "BH")
        )
}

.run_lm <- function(abundance, metadata, formula) {
    library(tibble)
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)

    # Fit linear regression model
    fit <- lm(abundance ~ ., data = mm)

    res <- broom::tidy(fit) |> column_to_rownames("term")
    colnames(res) <- gsub("\\.", "_", colnames(res))

    res <- res[!rownames(res) %in% c("(Intercept)"), , drop = FALSE]
    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL

    return(res)
}

.run_orm <- function(abundance, metadata, formula) {
    library(rms)
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)

    inds <- 1:(ncol(mm) - 1)
    vars <- colnames(mm)[inds]

    # Fit ordinal regression model with all variables
    # (Intercepts are automatically added)
    fit_1 <- orm(abundance ~ ., data = mm)
    score_1 <- fit_1$stats["Score"]

    res <- data.frame(estimate = fit_1$coefficients[vars], p_value = NA)

    # Calculate p-value based on score test for each variable
    if (length(inds) == 1) {
        res$p_value <- fit_1$stats["Score P"]
    } else {
        for (i in inds) {
            fit_0 <- orm(abundance ~ ., data = mm[, -i])
            score_0 <- fit_0$stats["Score"]
            res$p_value[i] <- as.numeric(1 - pchisq(score_1 - score_0, df = 1))
        }
    }

    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL
    return(res)
}

.run_firth <- function(abundance, metadata, formula) {
    library(logistf)
    # Create the design matrix of the model
    mm <- .create_design_matrix(formula, metadata)
    mm <- mm |>
        cbind.data.frame(abundance)

    # Fit Firth logistic regression model
    fit <- logistf(
        abundance ~ .,
        data = mm,
        control = logistf.control(maxit = 1000)
    )

    res <- data.frame(
        estimate = fit$coefficients,
        p_value = fit$prob
    )

    res <- res[!rownames(res) %in% c("(Intercept)"), , drop = FALSE]
    res[["variable"]] <- rownames(res)
    rownames(res) <- NULL

    return(res)
}

.create_design_matrix <- function(formula, metadata){
    formula <- .get_rhs_formula(formula)
    dm <- model.matrix(formula, metadata) |>
        as.data.frame()
    dm[["(Intercept)"]] <- NULL
    return(dm)
}
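A small illustration of what `.create_design_matrix` produces (base R, toy metadata with hypothetical values): model.matrix expands factors into dummy columns, and the intercept column is then dropped:

```r
# Toy sample metadata (illustration only, not the peerj13075 data).
md <- data.frame(Geographical_location = factor(c("A", "B", "A", "B")))

# model.matrix adds "(Intercept)" plus one 0/1 dummy column per non-reference
# factor level; dropping the intercept leaves only the predictor columns.
dm <- as.data.frame(model.matrix(~ Geographical_location, md))
dm[["(Intercept)"]] <- NULL
colnames(dm)  # "Geographical_locationB"
```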

 Depends:
-    R (>= 4.5.0)
+    R (>= 4.5.0),
+    mia

mia could be dropped if possible to avoid dependencies.

Add SCE and SE as "Depends"

Comment on lines +97 to +98
#' @importFrom rstatix t_test adjust_pvalue
#' @importFrom S4Vectors DataFrame

Add these @importFrom directives above the function that uses the package --> easier to keep track of them

Comment on lines +63 to +68
#' keep <- apply(assay_mat, 1, function(v) {
#' a <- v[grp == "Feces"]
#' b <- v[grp == "Skin"]
#' !(stats::var(a, na.rm = TRUE) == 0 && stats::var(b, na.rm = TRUE) == 0)
#' })
#' tse <- tse[keep, ]

This is too complex for an example

Comment on lines +37 to +38
#' @param split.by \code{Character vector} or \code{NULL}. Columns to split by.
#' Tests are run separately for each combination. (Default: \code{NULL})

I guess this is rarely used. It could be a "hidden" parameter under `...`. Fewer options is better for usability.

Comment on lines +135 to +137
if (!.is_non_empty_string(name)) {
stop("'name' must be a single character value.", call. = FALSE)
}

You could use .check_input, as it harmonizes the input checks and messages.

Comment on lines +174 to +175
all_vars <- c(group, split.by, pair.by)
all_vars <- unique(all_vars[!sapply(all_vars, is.null)])

If an element of a vector is NULL, it is dropped automatically. For instance, there is no such thing as c(1, 2, NULL); it is c(1, 2) instead.

So is this line necessary (I have not checked in detail)?
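A quick base-R demonstration of that point (added for illustration):

```r
# NULL elements never enter an atomic vector, so they cannot be filtered out
# afterwards; c() has already dropped them.
x <- c(1, 2, NULL)
identical(x, c(1, 2))  # TRUE

# Lists are different: a list CAN hold NULL, which is where is.null checks
# are genuinely needed.
y <- list(1, NULL, 2)
length(y)                            # 3
length(Filter(Negate(is.null), y))   # 2
```

So the is.null filter would matter if all_vars were a list, but not for a plain character vector built with c().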

Comment on lines +244 to +247
} else {
# Keep as plain data.frame for single test (no FeatureID/split.by)
df <- df
}

This is redundant

stop("'formula' must be a formula.", call. = FALSE)
}
# Get RHS of formula (handles both ~ group and value ~ group)
rhs <- as.character(formula)[length(as.character(formula))]

Seems not
