From 93fec6e253591d34942b4ab063f33865f3b0daf7 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 11:51:40 +0200 Subject: [PATCH 1/9] Remove ChIP-seq datasets corresponding to testing cellular contexts for binding pattern features --- R/tfFeatures.R | 9 ++++++++- tests/testthat/test-tfFeatures.R | 9 +++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/R/tfFeatures.R b/R/tfFeatures.R index 501ddf0..b1fd582 100644 --- a/R/tfFeatures.R +++ b/R/tfFeatures.R @@ -61,6 +61,10 @@ chIPMat <- .binMat(chIPMat, threshold=0) } + message(paste("Applying nonnegative matrix factorization on a matrix of dimension", + nrow(chIPMat),"x", ncol(chIPMat), collapse=" ")) + message(paste("Requested rank", nPatterns)) + if(is.null(aggFun)){ chIPMat <- as(chIPMat, "CsparseMatrix") nmfRes <- suppressMessages(RcppML::nmf(chIPMat, k=nPatterns, @@ -360,7 +364,10 @@ tfFeatures <- function(mae, # assay-matrices atacMat <- .convertToMatrix(assays(mae[[ATACEXP]])[[TOTALOVERLAPSFEATNAME]]) colnames(atacMat) <- colnames(mae[[ATACEXP]]) - whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName) + whichCol <- which(mae[[CHIPEXP]][[TFNAMECOL]]!=tfName & + !(mae[[CHIPEXP]][["combination"]] %in% + subset(sampleMap(mae), get(ISTESTCOL) & assay==CHIPEXP)$colname)) + chIPMat <- as(assays(mae[[CHIPEXP]])[[PEAKASSAY]][,whichCol],"CsparseMatrix") colnames(chIPMat) <- paste(colData(mae[[CHIPEXP]])[whichCol,annoCol], colData(mae[[CHIPEXP]])[whichCol,TFNAMECOL], diff --git a/tests/testthat/test-tfFeatures.R b/tests/testthat/test-tfFeatures.R index c4f960b..8ee4f60 100644 --- a/tests/testthat/test-tfFeatures.R +++ b/tests/testthat/test-tfFeatures.R @@ -66,3 +66,12 @@ test_that("No cofactors provided for co-binding features", { expect_warning(tfFeatures(maeTest2, tfName="JUN", tfCofactors=NULL), regexp="provide cofactor names") }) + +test_that("None of the testing ChIP-seq datasets used for binding pattern features", { + experiments(maeTest)[[TFFEAT]] <- NULL + experiments(maeTest2)[[TFFEAT]] <- NULL + expect_message(tfFeatures(maeTest, tfName="JUN", tfCofactors="CTCF"), + regexp="dimension 100 x 1") + expect_message(tfFeatures(maeTest2, tfName="JUN", tfCofactors="CTCF"), + regexp="dimension 100 x 1") +}) From 0e984f5ac23cf664c056dba9b756c19cc7f3dfe3 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 12:25:20 +0200 Subject: [PATCH 2/9] Explicitly ensure same order of feature matrix columns during prediction and prediction --- R/predict.R | 17 +++++++---------- tests/testthat/test-predictTfBinding.R | 9 ++++++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/R/predict.R b/R/predict.R index 5e5500e..c19fcc7 100644 --- a/R/predict.R +++ b/R/predict.R @@ -54,18 +54,15 @@ predictTfBinding <- function(models, npDt, chunk, contextColName){ data.table::setDTthreads(floor(numThreads/nWorker)) - allFeats <- listFeatures() - colsToRemove <- unlist(subset(allFeats, - !included_in_training)$feature_matrix_column_names) - colsToRemove <- c(colsToRemove, LABELCOLNAME, contextColName, CSCORECOLNAME) - if(name==MODELALLNAME){ - colsToRemove <- setdiff(colsToRemove, CSCORECOLNAME) - } + # get feature order, to ensure the same order of features + modText <- model$save_model_to_string() + featOrder <- unlist(tstrsplit(modText, split="\n", keep=8)) + featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" ")) if(!is.null(chunk) & is.numeric(chunk)){ predDts <- lapply(npDt, function(indDt){ - pred <- predict(model, as.matrix(data[indDt$ind, !(colnames(data) %in% colsToRemove)])) - predDt <- data.table(pred=pred) + preds <- predict(model, as.matrix(data[indDt$ind,featOrder])) + predDt <- data.table(pred=preds) if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)] predDt @@ -74,7 +71,7 @@ predictTfBinding <- function(models, predMat <- Matrix::Matrix(as.matrix(predDt)) } else{ - preds <- predict(model, as.matrix(data[,!(colnames(data) %in% colsToRemove)])) + preds <- predict(model, as.matrix(data[,featOrder])) predDt <- data.table(pred=preds) if(sparsify) predDt[,pred:=fifelse(pred*scalFactPred>model$params[[SPARSETHR]],pred,0L)] predMat <- Matrix::Matrix(as.matrix(predDt)) diff --git a/tests/testthat/test-predictTfBinding.R b/tests/testthat/test-predictTfBinding.R index 25bddd2..895711a 100644 --- a/tests/testthat/test-predictTfBinding.R +++ b/tests/testthat/test-predictTfBinding.R @@ -45,8 +45,15 @@ test_that("Predictions: sparsification",{ expect_no_error(preds <- predictTfBinding(modTest, fmTest, sparsify=TRUE)) }) - test_that("Predictions: chunking",{ preds <- NULL expect_no_error(preds <- predictTfBinding(modTest, fmTest, chunk=10)) }) + +# TODO: rethink this test, as lightgbm will not throw an error with mixed-up feature order. +# Meaning the case here does not check the feature order just that correct number of columns is removed +test_that("Feature order matches",{ + featCols <- sample(colnames(fmTest), replace=FALSE) + expect_no_error(predictTfBinding(modTest, fmTest[,featCols])) + expect_no_warning(predictTfBinding(modTest, fmTest[,featCols])) +}) From 3611083eb538d26ef1e995232804451880a4a1ce Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 15:07:13 +0200 Subject: [PATCH 3/9] Fixed motif id matching for profiles, added messages and warnings --- R/contextTfFeatures.R | 1 + R/getInsertionProfiles.R | 11 +++++++- tests/testthat/test-contextTfFeatures.R | 37 +++++++++++++++++++++++++ tests/testthat/test-insertionProfiles.R | 23 ++++++++++++++- 4 files changed, 70 insertions(+), 2 deletions(-) diff --git a/R/contextTfFeatures.R b/R/contextTfFeatures.R index 74e95b0..dea4e84 100644 --- a/R/contextTfFeatures.R +++ b/R/contextTfFeatures.R @@ -99,6 +99,7 @@ contextTfFeatures <- function(mae, motifPath <- subset(colData(maeSub[[MOTIFEXP]]), get(MOTIFNAMECOL)==tf)$origin baseDir <- metadata(colData(maeSub[[MOTIFEXP]]))[[BASEDIRCOL]] motifRanges <- readRDS(file.path(baseDir, motifPath)) + motifRanges$motif_id <- tfName if(addLabels){ colDataChIP <- as.data.table(colData(mae[[CHIPEXP]])) diff --git a/R/getInsertionProfiles.R b/R/getInsertionProfiles.R index 12d4fcc..7f736ac 100644 --- a/R/getInsertionProfiles.R +++ b/R/getInsertionProfiles.R @@ -116,6 +116,14 @@ getInsertionProfiles <- function(atacData, motifData[,motif_id:=1L] } + if(!calcProfile & !is.null(profiles) & + length(setdiff(motifData$motif_id, names(profiles)))>0){ + warning("Not all motif-ranges have an insertion-profile provided. + If wished to use a pre-computed profile provide one for all the motifs specified in the motifRanges arg. + Switching to computing profiles for all (calcProfile=FALSE).") + calcProfile <- TRUE + } + margin <- as.integer(margin) if(margin>0){ motifMarginRanges <- as.data.table(GenomicRanges::resize(motifRanges, @@ -180,6 +188,7 @@ getInsertionProfiles <- function(atacData, atacFrag <- split(atacFrag, by="chr") if(calcProfile){ + message("Computing insertion-profiles") atacProfiles <- BiocParallel::bpmapply(function(md, af, stranded, shiftLeft){ atacInserts <- .getInsertsPos(af, md, stranded, shiftLeft) atacProfile <- atacInserts[,.(pos_count_global=.N), @@ -238,7 +247,7 @@ getInsertionProfiles <- function(atacData, else{ atacProfiles <- profiles if(is.list(atacProfiles)){ - message("Using precomputed profiles") + message("Skipped insertion-profiles computation. Using provided pre-computed ones") atacProfiles <- rbindlist(atacProfiles, idcol="motif_id")} } diff --git a/tests/testthat/test-contextTfFeatures.R b/tests/testthat/test-contextTfFeatures.R index fe727a9..b98fe57 100644 --- a/tests/testthat/test-contextTfFeatures.R +++ b/tests/testthat/test-contextTfFeatures.R @@ -54,3 +54,40 @@ test_that("Error if features have not been computed for provided TF", { tfName="JUN" expect_error(contextTfFeatures(maeTest, tfName=tfName)) }) + +test_that("Using precomputed profile", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + regexp="Using pre-computed insertion-profiles") + expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") + expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.margin_tfMotif_1)), 0) + expect_equal(sum(is.na(assays(maeTest[["contextTfFeat"]])$contextTfFeat_weightedInserts.within_tfMotif_1)), 0) + + expect_no_warning(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile)) + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + message="Computing insertion-profiles") +}) + +test_that("Warning when using precomputed profile - not maching the motifRanges by name", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("ATF2"=profile) + + expect_warning(contextTfFeatures(maeTest, tfName="CTCF", + whichCol="OnlyTrain", + insertionProfile=profile), + regexp="*Not all motif-ranges*") + expect_message(suppressWarnings(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile)), + regexp="Computing insertion-profiles") +}) diff --git a/tests/testthat/test-insertionProfiles.R b/tests/testthat/test-insertionProfiles.R index 918be5d..4162ad4 100644 --- a/tests/testthat/test-insertionProfiles.R +++ b/tests/testthat/test-insertionProfiles.R @@ -83,7 +83,7 @@ test_that("Using precomputed profiles", { symmetric=TRUE, profiles=profiles, margin=10), - regexp="Using precomputed profiles") + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") profiles <- rbindlist(profiles, idcol="motif_id") expect_equal(insRes$insertProfiles, profiles) }) @@ -121,3 +121,24 @@ test_that("Simplified output format", { DEVFEATNAME)) expect_identical(motifCoords, rowRanges(insRes)) }) + +test_that("Warning when using precomputed profile - not maching the motifRanges by name", { + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("ATF2"=profile) + + motifRanges <- readRDS(exampleMotif[["JUN"]]) + motifRanges$motif_id <- "JUN" + + atacData <- fread(exampleATAC$K562) + expect_warning(getInsertionProfiles(atacData, + motifRanges=motifRanges, + profiles=profile, + calcProfile=FALSE), + regexp="Not all motif-ranges have an insertion-profile provided") + expect_message(suppressWarnings(getInsertionProfiles(atacData, + motifRanges=motifRanges, + profiles=profile, + calcProfile=FALSE)), + regexp="Computing insertion-profiles") +}) From 777cc38d114db298aac1366b14c9fcea94b9d3d6 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 15:26:12 +0200 Subject: [PATCH 4/9] Capture ... argument named profiles passed on to insertionProfiles --- R/contextTfFeatures.R | 10 ++++++++++ tests/testthat/test-contextTfFeatures.R | 26 +++++++++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/R/contextTfFeatures.R b/R/contextTfFeatures.R index dea4e84..be50cd8 100644 --- a/R/contextTfFeatures.R +++ b/R/contextTfFeatures.R @@ -115,6 +115,16 @@ contextTfFeatures <- function(mae, names(labels) <- contexts } + addArgs <- list(...) + if("profiles" %in% names(addArgs)){ + if(is.null(insertionProfile)){ + insertionProfile <- addArgs[["profiles"]]} + else{ + warning("Provided duplicated argument, profiles (via ...) and insertionProfile. + Using insertionProfile.") + } + } + # loop over contexts to get the features message("Get insert features") labels <- labels[contexts] # ensure ordering diff --git a/tests/testthat/test-contextTfFeatures.R b/tests/testthat/test-contextTfFeatures.R index b98fe57..1f4eb0f 100644 --- a/tests/testthat/test-contextTfFeatures.R +++ b/tests/testthat/test-contextTfFeatures.R @@ -91,3 +91,29 @@ test_that("Warning when using precomputed profile - not maching the motifRanges insertionProfile=profile)), regexp="Computing insertion-profiles") }) + +test_that("Using precomputed profile - add provided via ... (profiles) arg", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + regexp="Using pre-computed insertion-profiles") + expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") +}) + +test_that("Warning when using precomputed profille - via ... (profiles) and insertionProfile arg",{ + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + profile <- data.table(rel_pos=-200:200) + profile[,w:=1/nrow(profile)] + profile <- list("CTCF"=profile) + + expect_warning(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile, + insertionProfile=profile), + regexp="*Provided duplicated argument*") +}) From 81af0830ccb70d965c50926764d4761b62411da7 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 16:28:44 +0200 Subject: [PATCH 5/9] Get insertion-profiles on all training contexts if not provided - instead of per context profiles --- R/contextTfFeatures.R | 30 ++++++++++++----- tests/testthat/test-contextTfFeatures.R | 43 ++++++++++++++++++++++--- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/R/contextTfFeatures.R b/R/contextTfFeatures.R index be50cd8..cf6b88d 100644 --- a/R/contextTfFeatures.R +++ b/R/contextTfFeatures.R @@ -125,6 +125,27 @@ contextTfFeatures <- function(mae, } } + # pre-compute insertion-profile on training data + if(is.null(insertionProfile) & "Weighted_Inserts" %in% features){ + message("No insertion-profile provided, pre-computing on training data") + + trainCols <- unique(subset(sampleMap(mae), get(ISTRAINCOL))$colname) + labContexts <- getContexts(mae, tfName=tfName, which="Both") + trainContexts <- intersect(trainCols, labContexts) + + atacTrainFragFilePaths <- unlist(subset(colData(mae[[ATACEXP]]), + get(annoCol) %in% trainContexts)$origin) + baseDir <- metadata(colData(mae[[ATACEXP]]))[[BASEDIRCOL]] + atacTrainFragPaths <- file.path(baseDir, atacTrainFragFilePaths) + + ins <- getInsertionProfiles(atacTrainFragPaths, motifRanges) + insertionProfile <- list(ins$insertProfiles) + names(insertionProfile) <- tfName + } + else if(!("Weighted_Inserts" %in% features)){ + insertionProfile <- NULL + } + # loop over contexts to get the features message("Get insert features") labels <- labels[contexts] # ensure ordering @@ -134,20 +155,13 @@ contextTfFeatures <- function(mae, threads, BPPARAM, ...){ data.table::setDTthreads(threads) - calcProfile <- FALSE - if("Weighted_Inserts" %in% features & is.null(profile)){ - calcProfile <- TRUE - } - else if("Weighted_Inserts" %in% features & !is.null(profile)){ - message("Using pre-computed insertion-profiles") - } - atacFrag <- atacFrag[names(atacFrag)==context] addArgs <- list(...) addArgs <- addArgs[names(addArgs) %in% c("margin", "shift", "subSample","symmetric", "stranded")] + calcProfile <- FALSE args <- c(list(atacData=atacFrag, motifRanges=motifRanges, profiles=profile, calcProfile=calcProfile, BPPARAM=BPPARAM), addArgs) diff --git a/tests/testthat/test-contextTfFeatures.R b/tests/testthat/test-contextTfFeatures.R index 1f4eb0f..2daa5e6 100644 --- a/tests/testthat/test-contextTfFeatures.R +++ b/tests/testthat/test-contextTfFeatures.R @@ -61,9 +61,14 @@ test_that("Using precomputed profile", { profile[,w:=1/nrow(profile)] profile <- list("CTCF"=profile) - expect_message(contextTfFeatures(maeTest, tfName="CTCF", + contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile) + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", insertionProfile=profile), - regexp="Using pre-computed insertion-profiles") + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=profile), + message="Computing insertion-profiles") expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", insertionProfile=profile), regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") @@ -98,15 +103,18 @@ test_that("Using precomputed profile - add provided via ... (profiles) arg", { profile[,w:=1/nrow(profile)] profile <- list("CTCF"=profile) - expect_message(contextTfFeatures(maeTest, tfName="CTCF", + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", profiles=profile), - regexp="Using pre-computed insertion-profiles") + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(contextTfFeatures(maeTest, tfName="CTCF", + profiles=profile), + message="Computing insertion-profiles") expect_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", profiles=profile), regexp="Skipped insertion-profiles computation. Using provided pre-computed ones") }) -test_that("Warning when using precomputed profille - via ... (profiles) and insertionProfile arg",{ +test_that("Warning when using precomputed profile - via ... (profiles) and insertionProfile arg",{ experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL profile <- data.table(rel_pos=-200:200) profile[,w:=1/nrow(profile)] @@ -117,3 +125,28 @@ test_that("Warning when using precomputed profille - via ... (profiles) and ins insertionProfile=profile), regexp="*Provided duplicated argument*") }) + +test_that("Using message when no pre-computed profile provided", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=NULL), + regexp="No insertion-profile provided, pre-computing on training data") + expect_message(contextTfFeatures(maeTest, tfName="CTCF", + insertionProfile=NULL), + regexp="Computing insertion-profiles") +}) + +test_that("Weighted_Inserts not in features", { + experiments(maeTest)[[CONTEXTTFFEAT]] <- NULL + + expect_no_message(contextTfFeatures(maeTest, features="Inserts", + tfName="CTCF"), + message="No insertion-profile provided, pre-computing on training data") + expect_no_message(maeTest <- contextTfFeatures(maeTest, tfName="CTCF", + features="Inserts"), + message="Computing insertion-profiles") + + expect_equal(sum(grepl("weightedInserts", names(assays(maeTest[["contextTfFeat"]])))), + 0) +}) From 9a52a8c4fadb2b666c0c51f7f1c3053015a93e3a Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 24 Oct 2025 18:06:15 +0200 Subject: [PATCH 6/9] Fix: Normalization of NA containing columns --- R/getFeatureMatrix.R | 20 +++++++++++--------- R/helpers.R | 12 ++++++------ tests/testthat/test-getFeatureMatrix.R | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/R/getFeatureMatrix.R b/R/getFeatureMatrix.R index 9d277cf..f0752db 100644 --- a/R/getFeatureMatrix.R +++ b/R/getFeatureMatrix.R @@ -13,34 +13,36 @@ return(mat) } -.robustNormalization <- function(mat){ - qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col") +.robustNormalization <- function(mat, naRm=TRUE){ + qs <- .marginQuant(mat, probs=c(0.25,0.5,0.75), margin="col", naRm=naRm) Matrix::Matrix(t(t(sweep(mat, 2, qs[2,], "-"))/max((qs[3,]-qs[1,]),1e-5))) } -.minMaxNormalization <- function(mat, useMax=FALSE){ +.minMaxNormalization <- function(mat, useMax=FALSE, naRm=TRUE){ if(useMax){ - qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col") + qs <- .marginQuant(mat, probs=c(0.0,1.0), margin="col", naRm=naRm) } else{ - qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col") + qs <- .marginQuant(mat, probs=c(0.0,0.9), margin="col", naRm=naRm) } Matrix::Matrix(t(t(mat)/max((qs[2,]-qs[1,]),1e-5))) } .contextNormalization <- function(mat, method=c("robust", "min-max", - "column", "none")){ + "column", "none"), + naRm=TRUE){ method <- match.arg(method, choices=c("robust", "min-max", "column", "none")) if(method=="column"){ - normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat), rep(1e-5, nrow(mat)))) + normMat <- Matrix::t(Matrix::t(mat)/pmax(colSums(mat, na.rm=naRm), + rep(1e-5, nrow(mat)))) } else if(method=="min-max"){ - normMat <- .minMaxNormalization(mat) + normMat <- .minMaxNormalization(mat, naRm=naRm) } else if(method=="robust"){ - normMat <- .robustNormalization(mat) + normMat <- .robustNormalization(mat, naRm=naRm) } else if(method=="none") { diff --git a/R/helpers.R b/R/helpers.R index 8446ae8..73e34f6 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -11,7 +11,7 @@ return(mat) } -.marginMax <- function(mat, margin=c("row", "col")){ +.marginMax <- function(mat, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowMaxs} @@ -20,11 +20,11 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginMax <- fun(mat) + marginMax <- fun(mat, na.rm=naRm) return(marginMax) } -.marginSum <- function(mat, margin=c("row", "col")){ +.marginSum <- function(mat, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowSums} @@ -33,11 +33,11 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginMax <- fun(mat) + marginMax <- fun(mat, na.rm=naRm) return(marginMax) } -.marginQuant <- function(mat, probs, margin=c("row", "col")){ +.marginQuant <- function(mat, probs, margin=c("row", "col"), naRm=TRUE){ margin <- match.arg(margin, choices=c("col", "row")) if(margin=="row"){fun <- MatrixGenerics::rowQuantiles} @@ -45,7 +45,7 @@ if(!is(mat, "CsparseMatrix") & !is(mat, "TsparseMatrix")){ mat <- as.matrix(mat) } - marginQuant <- t(fun(mat, probs=probs)) + marginQuant <- t(fun(mat, probs=probs, na.rm=naRm)) return(marginQuant) } diff --git a/tests/testthat/test-getFeatureMatrix.R b/tests/testthat/test-getFeatureMatrix.R index 768dd8a..232c237 100644 --- a/tests/testthat/test-getFeatureMatrix.R +++ b/tests/testthat/test-getFeatureMatrix.R @@ -208,3 +208,24 @@ test_that("Feature Matrix: correct features are normalized", { normedCols <- colnames(fm)[grepl(NORMEDAFFIX,colnames(fm))] expect_length(setdiff(normedCols, colNamesAll), 0) }) + +test_that("Feature Matrix: normalization does not fail with NA containing columns", { + + randMat <- matrix(runif(1e3), ncol=10, nrow=100) + randMat[sample(1:length(randMat),100)] <- NA + + robNormed <- .robustNormalization(randMat) + expect_equal(which(is.na(robNormed)), which(is.na(randMat))) + expect_equal(which(is.na(robNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(robNormed)), sum(!is.na(randMat))) + + minMaxNormed <- .minMaxNormalization(randMat) + expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat))) + expect_equal(which(is.na(minMaxNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(minMaxNormed)), sum(!is.na(randMat))) + + colNormed <- .contextNormalization(randMat, method="column") + expect_equal(which(is.na(colNormed)), which(is.na(randMat))) + expect_equal(which(is.na(colNormed)), which(is.na(randMat))) + expect_equal(sum(!is.na(colNormed)), sum(!is.na(randMat))) +}) From 3ed90442f07ea4b1f53d2eca1c2bc5a1d56f208c Mon Sep 17 00:00:00 2001 From: emsonder <38004531+emsonder@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:30:50 +0100 Subject: [PATCH 7/9] Fixed warning when pre-computed profiles provided only for a subset of motifs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- R/getInsertionProfiles.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/getInsertionProfiles.R b/R/getInsertionProfiles.R index 7f736ac..4d377a6 100644 --- a/R/getInsertionProfiles.R +++ b/R/getInsertionProfiles.R @@ -120,7 +120,7 @@ getInsertionProfiles <- function(atacData, length(setdiff(motifData$motif_id, names(profiles)))>0){ warning("Not all motif-ranges have an insertion-profile provided. If wished to use a pre-computed profile provide one for all the motifs specified in the motifRanges arg. - Switching to computing profiles for all (calcProfile=FALSE).") + Switching to computing profiles for all (calcProfile=TRUE).") calcProfile <- TRUE } From fdc6fb5cd2be90fb684c0b9a17fe813b35314510 Mon Sep 17 00:00:00 2001 From: emsonder Date: Mon, 27 Oct 2025 16:22:10 +0100 Subject: [PATCH 8/9] Increased version number --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 52693ae..bbce521 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TFBlearner Title: Functionality for training TF-specific classifiers to predict TF bindings based on ATAC-seq data. -Version: 0.1.1 +Version: 0.1.2 Authors@R: person("Emanuel", "Sonder", , "emanuel.sonder@hest.ethz.ch", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4788-9508")) From 2ef394932f73c3a6db821c39778083e5e8a478dc Mon Sep 17 00:00:00 2001 From: emsonder Date: Tue, 28 Oct 2025 11:00:31 +0100 Subject: [PATCH 9/9] Added error-trigger in case no feature_names row in model-string for retrieving feature names/order --- R/predict.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/predict.R b/R/predict.R index c19fcc7..12f9790 100644 --- a/R/predict.R +++ b/R/predict.R @@ -57,6 +57,9 @@ predictTfBinding <- function(models, # get feature order, to ensure the same order of features modText <- model$save_model_to_string() featOrder <- unlist(tstrsplit(modText, split="\n", keep=8)) + if(!any(grepl("feature_names", featOrder))){ + stop("Feature names could not be retrieved from the provided model.") + } featOrder <- unlist(tstrsplit(gsub("feature_names=","", featOrder), split=" ")) if(!is.null(chunk) & is.numeric(chunk)){