From dfa4dc659db34232f9f4ba904386126ca16321fc Mon Sep 17 00:00:00 2001 From: emsonder Date: Thu, 31 Jul 2025 20:00:39 +0200 Subject: [PATCH 1/6] Fix: Naming and selection of associated motifs --- R/tfFeatures.R | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/R/tfFeatures.R b/R/tfFeatures.R index c0bd35a..75fdf87 100644 --- a/R/tfFeatures.R +++ b/R/tfFeatures.R @@ -141,7 +141,6 @@ .selectMotifs <- function(matchScores, maxScores, labels, - addThr=4, nMotifs=10, subSample=10000) { @@ -156,10 +155,9 @@ # top motif scores matchCoScores <- matchSubScores - matchCoScores@x[matchCoScores@x < thr[matchCoScores@j + 1] & - matchCoScores@x= thr[matchCoScores@j + 1] & - matchCoScores@x>=addThr*scalFactMotif] <- 1 + matchCoScores@x[matchCoScores@x < thr[matchCoScores@j + 1]] <- 0 + matchCoScores@x[matchCoScores@x >= thr[matchCoScores@j + 1]] <- 1 + matchCoScores <- drop0(matchCoScores) # get mutually exclusive motif scores zeroInd <- which(matchSubScores==0, arr.ind = TRUE) @@ -168,28 +166,34 @@ x=rep(1, nrow(zeroInd)), dims=c(nrow(matchSubScores), ncol(matchSubScores))) + colnames(matchExScores) <- colnames(matchSubScores) # jaccard index of mutually exclusive and top co-occuring motifs labels <- matrix(labels, nrow=length(labels), ncol=1) matchCo <- .jaccard(matchCoScores, labels) - matchCo[,motif_id:=1:.N] setorder(matchCo, -cont) - topCoMotif <- matchCo$motif_id[1:nMotifs] + topCoMotif <- matchCo$set1_col[1:nMotifs] matchEx <- .jaccard(matchExScores, labels) - matchEx[,motif_id:=1:.N] setorder(matchEx, -cont) - topExMotif <- matchEx$motif_id[1:nMotifs] + topExMotif <- matchEx$set1_col[1:nMotifs] - topExMotif <- intersect(topExMotif, colnames(matchScores)) - topCoMotif <- intersect(topCoMotif, colnames(matchScores)) selectedMotifs <- c(topCoMotif, topExMotif) if(length(selectedMotifs)>0){ - names(selectedMotifs) <- c(paste(COMOTIFAFFIX, 1:length(topCoMotif), sep="_"), - paste(EXMOTIFAFFIX, 1:length(topExMotif), sep="_"))} - - # can happen if the motif-matches matrix has less columns than motifs to select - selectedMotifs <- unique(selectedMotifs[!is.na(selectedMotifs)]) + if(length(topCoMotif)>0){ + namesCo <- paste(COMOTIFAFFIX, 1:length(topCoMotif), sep="_") + } + else{ + namesCo <- NULL + } + if(length(topExMotif)>0){ + namesEx <- paste(EXMOTIFAFFIX, 1:length(topExMotif), sep="_") + } + else{ + namesEx <- NULL + } + names(selectedMotifs) <- c(namesCo, namesEx) + } return(selectedMotifs) } @@ -510,8 +514,9 @@ tfFeatures <- function(mae, maxScores <- colDataMotifs[[MAXSCORECOL]] selMotifs <- .selectMotifs(matchScores, maxScores, labels, nMotifs=nMotifs) + print(names(selMotifs)) if(length(selMotifs)>0){ - names(selMotifs) <- paste0(SELMOTIFPREFIX, names(selMotifs))} + names(selMotifs) <- paste(SELMOTIFPREFIX, names(selMotifs), sep=".")} } else{ selMotifs <- NULL @@ -559,7 +564,7 @@ tfFeatures <- function(mae, actAssoc <- assays(mae[[ASSOCEXP]])[[ASSOCASSAY]] actAssoc <- actAssoc[,!c(colnames(actAssoc) %in% priorMotifCols), drop=FALSE] selActMotifs <- .selectMotifs(actAssoc, rep(1, ncol(actAssoc)), labels, - addThr=0, nMotifs=nMotifs) + nMotifs=nMotifs) if(length(selActMotifs)>0){ names(selActMotifs) <- paste0(SELMOTIFPREFIX, names(selActMotifs))} } From f066dfa36ba88db73fdfa8e23dd4d16ca862b9c0 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 1 Aug 2025 09:49:54 +0200 Subject: [PATCH 2/6] Adding scaling factor for activity estimates --- R/contextFeatures.R | 2 +- R/scalingFactors.R | 3 ++- R/tfFeatures.R | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/contextFeatures.R b/R/contextFeatures.R index 5065929..ae3cc10 100644 --- a/R/contextFeatures.R +++ b/R/contextFeatures.R @@ -119,7 +119,7 @@ saveHdf5, outDir){ data.table::setDTthreads(threads) - x <- as.integer(round(1000*cor(atacMat, assay(cvSe, NORMDEVASSAY)[motif,]))) + x <- as.integer(round(scaleFactAct*cor(atacMat, assay(cvSe, NORMDEVASSAY)[motif,]))) q <- as.integer(round(quantile(x, prob=c(0,0.1,0.2,0.8,0.9,1), na.rm=TRUE))) x[abs(x)0){ names(selActMotifs) <- paste0(SELMOTIFPREFIX, names(selActMotifs))} } From c963c1ae194224d57761daac6e12434f49742909 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 1 Aug 2025 10:46:58 +0200 Subject: [PATCH 3/6] Keep only exact cofactor motif matches --- R/tfFeatures.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/tfFeatures.R b/R/tfFeatures.R index 4a051fe..dac85ee 100644 --- a/R/tfFeatures.R +++ b/R/tfFeatures.R @@ -474,8 +474,7 @@ tfFeatures <- function(mae, names(tfSimMotifCols) <- paste(PRIORMOTIFPREFIX, 1:length(tfSimMotifCols), sep="_")} - tfCofactorCols <- unique(grep(paste(tfCofactors,collapse="|"), - motifNames, value=TRUE)) + tfCofactorCols <- intersect(tfCofactors, motifNames) if(length(tfCofactorCols)>0){ names(tfCofactorCols) <- paste(TFCOFACTORMOTIFPREFIX, 1:length(tfCofactorCols), sep="_")} @@ -514,7 +513,6 @@ tfFeatures <- function(mae, maxScores <- colDataMotifs[[MAXSCORECOL]] selMotifs <- .selectMotifs(matchScores, maxScores, labels, nMotifs=nMotifs) - print(names(selMotifs)) if(length(selMotifs)>0){ names(selMotifs) <- paste(SELMOTIFPREFIX, names(selMotifs), sep=".")} } @@ -531,8 +529,7 @@ tfFeatures <- function(mae, names(tfSimMotifCols) <- paste(PRIORMOTIFPREFIX, 1:length(tfSimMotifCols), sep="_")} - tfCofactorCols <- unique(grep(paste(tfCofactors,collapse="|"), - actMotifNames, value=TRUE)) + tfCofactorCols <- intersect(tfCofactors, actMotifNames) if(length(tfCofactorCols)>0){ names(tfCofactorCols) <- paste(TFCOFACTORMOTIFPREFIX, 1:length(tfCofactorCols), sep="_")} From 49bb23e449e1b4e55c506778a0b422eb54dfde7a Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 1 Aug 2025 11:58:33 +0200 Subject: [PATCH 4/6] Critical Fix: Proper binding of non-context features during feature matrix construction --- R/getFeatureMatrix.R | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/R/getFeatureMatrix.R b/R/getFeatureMatrix.R index ebab494..db8a19b 100644 --- a/R/getFeatureMatrix.R +++ b/R/getFeatureMatrix.R @@ -71,10 +71,22 @@ mats <- lapply(mats, as, "TsparseMatrix") js <- cumsum(nCols) indDt <- lapply(1:length(mats), function(i){ - j <- js[[i]] + if(i==1){ + s <- 1 + }else{ + s <- js[[i-1]]+1 + } + jNew <- s:js[[i]] + jOld <- mats[[i]]@j + if(length(jOld)==0){ + repTimes <- 0} + else{ + repTimes <- table(jOld) + } + jFull <- rep(jNew, times=repTimes) indDt <- data.table(x=mats[[i]]@x, i=mats[[i]]@i+1, - j=rep(j, length(mats[[i]]@x)))}) + j=jFull)}) indDt <- rbindlist(indDt) # Get matrix @@ -326,13 +338,13 @@ getFeatureMatrix <- function(mae, colnames(featsNormedMat))] # normalize by maximum ATAC-signal - if(MAXATACCOLNAME %in% colnames(nonContextTfFeat)){ + if(MAXATACCOLNAME %in% colnames(otherFeatMat)){ whichCol <- grepl(paste(CONTEXTTFFEAT, INSERTFEATNAME, sep="_"), colnames(featsContextMat)) countCols <- c(colnames(featsContextMat)[whichCol], paste(CONTEXTFEAT, TOTALOVERLAPSFEATNAME, sep="_")) scaledSig <- .minMaxNormalization(featsContextMat[,countCols, drop=FALSE]) - maxSig <- nonContextTfFeat[,MAXATACCOLNAME, drop=TRUE] + maxSig <- otherFeatMat[,MAXATACCOLNAME, drop=TRUE] maxScaledMat <- scaledSig / pmax(maxSig, 1e-4) colnames(maxScaledMat) <- paste(colnames(maxScaledMat), NORMEDMAXAFFIX, sep="_") @@ -380,12 +392,12 @@ getFeatureMatrix <- function(mae, else{ return(featsMat) } - }, seAtac, seTfContext, - tfName, tfCofactors, - nonContextTfFeat, norm, - saveChunk, hdf5FileName, - annoCol, - addLabels, convertInteger) + }, seAtac=seAtac, seTfContext=seTfContext, + tfName=tfName, tfCofactors=tfCofactors, + otherFeatMat=nonContextTfFeat, norm=norm, + saveChunk=saveChunk, hdf5FileName=hdf5FileName, + annoCol=annoCol, addLabels=addLabels, + convertInteger=convertInteger) featMats <- Reduce("rbind", featMats[-1], featMats[[1]]) featMats <- suppressWarnings({Matrix::Matrix(featMats)}) From ff3450154bb7187d8e3253547f377d1547016384 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 1 Aug 2025 12:27:17 +0200 Subject: [PATCH 5/6] Fix: Saving feature matrix without labels as .h5 --- R/getFeatureMatrix.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/getFeatureMatrix.R b/R/getFeatureMatrix.R index db8a19b..e94988d 100644 --- a/R/getFeatureMatrix.R +++ b/R/getFeatureMatrix.R @@ -229,13 +229,13 @@ getFeatureMatrix <- function(mae, length(assays(seTfContext))+ length(assays(seAtac))+ length(intersect(colnames(colData(seAtac)), - paste(MDSDIMFEATNAME, 1:2, sep="_"))) + paste(MDSDIMFEATNAME, 1:2, sep="_")))+ + sum(addLabels) if(MAXATACCOLNAME %in% colnames(nonContextTfFeat)){ nFeats <- nFeats+sum(grepl(INSERTFEATNAME, names(assays(seTfContext))))+ sum(TOTALOVERLAPSFEATNAME %in% names(assays(seAtac))) } - if(addLabels) nFeats <- nFeats+1 # for context-label column if(saveHdf5) { From c777c847675ca0240be8078b40fab06159b5bfa5 Mon Sep 17 00:00:00 2001 From: emsonder Date: Fri, 1 Aug 2025 12:27:36 +0200 Subject: [PATCH 6/6] Increased version number: Various fixes regarding motif selection and feature matrix construction --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 2a5e2b9..83b4455 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TFBlearner Title: Functionality for training TF-specific classifiers to predict TF bindings based on ATAC-seq data. -Version: 0.0.1.0001 +Version: 0.0.1.1000 Authors@R: person("Emanuel", "Sonder", , "emanuel.sonder@hest.ethz.ch", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4788-9508"))