From 7dee9f63971a2a11139d1405d56a8a24e8d0781d Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 11 Feb 2020 09:37:49 +0800 Subject: [PATCH 01/10] Factorized some code from dataProject.R --- R/panelProjection.R | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 R/panelProjection.R diff --git a/R/panelProjection.R b/R/panelProjection.R new file mode 100644 index 0000000..ea674ae --- /dev/null +++ b/R/panelProjection.R @@ -0,0 +1,51 @@ +#' Compute Reference Component features for clustering analysis +#' +#' @param rca.obj RCA object. +#' @param panel Reference panel as a matrix +#' @param corMeth Any of the correlation measures supported by R, defaults to pearson +#' @param power power to raise up to for the RCA features before clustering, default is 4 +#' @param scale True if the data should be scaled, False otherwise +#' @param apply_threshold True if a threshold to raise the lowest expression values needs to be applied on panel data, False otherwise. +#' @param threshold Minimal wished expression value for panel data. All lower expression values in panel will be raised to this value if apply_threshold is TRUE. +#' @return RCA object. +#' @export +#' + +panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRUE, apply_threshold=FALSE, threshold=NULL) { + + # Select genes that are shared by the input data and the panel + shared_genes <- intersect(rownames(sc_data), rownames(panel)) + print(paste0("Projection on ", length(shared_genes), " genes.")) + + # Reduce the panel and input data to the shared genes + subset_panel = panel[shared_genes, ] + subset_data = sc_data[shared_genes, , drop = FALSE] + + # For values in the panel below the minimum threshold, set those values to threshold + if (apply_threshold) { + subset_panel[subset_panel <= threshold] = threshold + } + + # Compute projection of input data with the panel + if(corMeth == "pearson") { + subset_panel = as.matrix(subset_panel) + projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) + } else { + projection <- cor(subset_panel, subset_data, method = corMeth) + } + rownames(projection) <- colnames(subset_panel) + colnames(projection) <- colnames(subset_data) + + # Raise the projection to power + projection = abs(projection) ^ (power) * sign(projection) + + # If scaling is required + if (scale) { + # Scale + projection = scale(projection, + center = TRUE, + scale = TRUE) + } + + return(projection) +} \ No newline at end of file From 7bf4a471e4fe69dc7cc10f4b279bd8dcbbb2c7eb Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 11 Feb 2020 09:41:28 +0800 Subject: [PATCH 02/10] Factorized version of dataProject.R, working on implementing ColonEpitheliumPanel --- R/dataProject.R | 142 +++++++++--------------------------------------- 1 file changed, 27 insertions(+), 115 deletions(-) diff --git a/R/dataProject.R b/R/dataProject.R index 22c3be3..4304cbd 100644 --- a/R/dataProject.R +++ b/R/dataProject.R @@ -1,3 +1,5 @@ +source("panelProjection.R") + #' Compute Reference Component features for clustering analysis #' #' @param rca.obj RCA object. @@ -30,45 +32,14 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # For each fragment of the Global Panel for (i in 1:length(ReferencePanel[[1]])) { - - # Initialise panel - panel = ReferencePanel[[1]][[i]] - - # Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # For values in the panel below the minimum threshold, set those values to threshold - subset_panel[subset_panel <= (ReferencePanel$at)[i]] = (ReferencePanel$at)[i] - - # Compute projection of input data with the panel fragment - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection_fragment <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection_fragment <- cor(subset_panel, subset_data, method = corMeth) - } - - - # Reattach dimnames - colnames(projection_fragment) <- colnames(subset_data) - rownames(projection_fragment) <- colnames(subset_panel) - - # Raise the projection fragment to power - projection_fragment = abs(projection_fragment) ^ (power) * sign(projection_fragment) - - # If scaling is required - if (scale) { - - # Scale - projection_fragment = scale(projection_fragment, center = TRUE, scale = TRUE) - } - + # Store projection data of fragment of Global Panel - projection_list[[i]] = projection_fragment + projection_list[[i]] = panelProjection(sc_data, + ReferencePanel[[1]][[i]], + corMeth=corMeth, + power=power, scale=scale, + apply_threshold=TRUE, + threshold=(ReferencePanel$at)[i]) } # Combine the projection result of multiple Global Panel fragments @@ -80,63 +51,29 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Load reference panel data from environment data(ReferencePanel, envir = environment()) - + panel = ReferencePanel$ColonEpiPanel + gene.names = as.character(str_extract_all(rownames(panel), "_[[:alnum:]]+_")) + rownames(panel) = str_sub(gene.names, 2, -2) # Scale panel by median - fc = apply(ReferencePanel$ColonEpiPanel, 1, function(x) x - median(x)) - + fc = apply(panel, 1, function(x) x - median(x)) fs = fc > 1.5 - - fs1 = rownames(ReferencePanel$ColonEpiPanel[apply(fs, 1, function(x) - sum(x)) > 0,]) - gl_intersect = intersect(rownames(fpkm_temp), fs1) - projection = as.data.frame(cor(fpkm_temp[gl_intersect,], ReferencePanel$ColonEpiPanel[gl_intersect,], corMeth)) - projection = abs(projection) ^ (power) * sign(projection) - if (scale) { - projection = scale(projection, - center = TRUE, - scale = TRUE) - } + panel + panel[apply(fs, 1, function(x) + sum(x)) > 0,] + + # Store projection data + projection= panelProjection(sc_data, panel, corMeth=corMeth, + power=power, scale=scale) } + # If panel for correlation is ENCODEPanel else if (method == "ENCODEPanel") { # Load reference panel data from environment data("ENCODEPanel", envir = environment()) - panel <- ENCODEPanel - - # Initialise variable to store projection data from the two fragments of the Global Panel - projection_list = list() - - # Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # Compute projection of input data with the panel - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection <- cor(subset_panel, subset_data, method = corMeth) - } - rownames(projection) <- colnames(subset_panel) - colnames(projection) <- colnames(subset_data) - - # Raise the projection to power - projection = abs(projection) ^ (power) * sign(projection) - - # If scaling is required - if (scale) { - - # Scale - projection = scale(projection, - center = TRUE, - scale = TRUE) - } - + # Store projection data + projection= panelProjection(sc_data, ENCODEPanel, corMeth=corMeth, + power=power, scale=scale) } # If no provided method is chosen, it is assumed that the user wishes to use a custom panel @@ -144,35 +81,10 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Load panel from path provided panel <- readRDS(customPath) - - # Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # Compute projection of input data with the panel - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection <- cor(subset_panel, subset_data, method = corMeth) - } - rownames(projection) <- colnames(subset_panel) - colnames(projection) <- colnames(subset_data) - - # Raise the projection to power - projection = abs(projection) ^ (power) * sign(projection) - - # If scaling is required - if (scale) { - - # Scale - projection = scale(projection, - center = TRUE, - scale = TRUE) - } + + # Store projection data + projection = panelProjection(sc_data, panel, corMeth=corMeth, + power=power, scale=scale) } # Store projection result as Matrix From 04274217da24a58b109162fd0914471f9b366514 Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 11 Feb 2020 14:51:20 +0800 Subject: [PATCH 03/10] Finished to implement ColonEpitheliumPanel in dataProject --- R/dataProject.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/R/dataProject.R b/R/dataProject.R index 4304cbd..1dc193b 100644 --- a/R/dataProject.R +++ b/R/dataProject.R @@ -1,5 +1,5 @@ source("panelProjection.R") - +source("~/Work/RCAv2/R/panelProjection.R") #' Compute Reference Component features for clustering analysis #' #' @param rca.obj RCA object. @@ -52,14 +52,16 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Load reference panel data from environment data(ReferencePanel, envir = environment()) panel = ReferencePanel$ColonEpiPanel - gene.names = as.character(str_extract_all(rownames(panel), "_[[:alnum:]]+_")) - rownames(panel) = str_sub(gene.names, 2, -2) # Scale panel by median fc = apply(panel, 1, function(x) x - median(x)) fs = fc > 1.5 - panel + panel[apply(fs, 1, function(x) + panel = panel[apply(fs, 1, function(x) sum(x)) > 0,] - + # Convert rownames to gene symbol and remove duplicates + gene.names = as.character(str_extract_all(rownames(panel), "_.+_")) + gene.names = str_sub(gene.names, 2, -2) + panel = panel[-which(duplicated(gene.names)),] + rownames(panel) = gene.names[-which(duplicated(gene.names))] # Store projection data projection= panelProjection(sc_data, panel, corMeth=corMeth, power=power, scale=scale) From 844fb22ea9aec0e27ea708e4d8a0b87f05b86c03 Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Thu, 13 Feb 2020 10:13:54 +0800 Subject: [PATCH 04/10] Pre-processing of ColonEpitheliumPanel now combines duplicates instead of removing them. --- R/dataProject.R | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/R/dataProject.R b/R/dataProject.R index 1dc193b..43e76dd 100644 --- a/R/dataProject.R +++ b/R/dataProject.R @@ -52,16 +52,26 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Load reference panel data from environment data(ReferencePanel, envir = environment()) panel = ReferencePanel$ColonEpiPanel + # Convert rownames to gene symbol and sum duplicates + gene.names = as.character(str_extract_all(rownames(panel), "_.+_")) + gene.names = str_sub(gene.names, 2, -2) + panel = cbind(panel, gene.names) + dup.genes = unique(gene.names[duplicated(gene.names)]) + for (gene in dup.genes) { + sub.panel = panel[which(gene.names == gene),1:(ncol(panel) - 1)] + new.row = apply(sub.panel, MARGIN = 2, FUN = "sum") + for (i in which(gene.names == gene)) { + panel[i, 1:(ncol(panel) - 1)] = new.row + } + } + panel = panel[-which(duplicated(panel$gene.names)),] + rownames(panel) = panel$gene.names + panel = panel[,-ncol(panel)] # Scale panel by median fc = apply(panel, 1, function(x) x - median(x)) fs = fc > 1.5 panel = panel[apply(fs, 1, function(x) sum(x)) > 0,] - # Convert rownames to gene symbol and remove duplicates - gene.names = as.character(str_extract_all(rownames(panel), "_.+_")) - gene.names = str_sub(gene.names, 2, -2) - panel = panel[-which(duplicated(gene.names)),] - rownames(panel) = gene.names[-which(duplicated(gene.names))] # Store projection data projection= panelProjection(sc_data, panel, corMeth=corMeth, power=power, scale=scale) @@ -100,3 +110,4 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM return(rca.obj) } + \ No newline at end of file From 7a9ed0bf4229a760f332d437cc6fbbd2b23c521f Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Mon, 17 Feb 2020 14:47:19 +0800 Subject: [PATCH 05/10] Added ctQueryReg a function to select cell-type (for panel) with a query using regular expression. --- R/ctQueryReg.R | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 R/ctQueryReg.R diff --git a/R/ctQueryReg.R b/R/ctQueryReg.R new file mode 100644 index 0000000..91003a7 --- /dev/null +++ b/R/ctQueryReg.R @@ -0,0 +1,63 @@ +#' Function to parse from a character vector all character matching all keywords from a query. +#' +#' @param query character or character vector. +#' @param db a character vector to parse query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with corresonding full character of db as first column. +#' @param remakedb a boolean value, default is TRUE. Put to FALSE (you should not do this) if you can provide a matrix as db as describe in db parameter. +#' @return a character vector with all matches for one query. +#' @details +#' This function will look for matches between a query and a character database using regular expression for text mining. +#' This function is not perfect and require some good practices: +#' - Take look at your database to design your query. +#' - Include both litteral and acronym form (if it exist) of a term in distinct query character. +#' - Avoid using plural forms in query. +#' - Correclty spell your query. +#' - Database will not always be clean, and concatenated words (for instance: forinstance) +#' will not be detected but with a specific corresponding concatened query. +#' @export +#' +ctQueryReg = function(query, db ,remakedb=TRUE) { + # Parse words in db if needed + if (remakedb) { + new_db = matrix(ncol = 2) + for (i in db) { + ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i) + words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE)) + one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words)) + new_db = rbind(new_db, one_ct) + } + colnames(new_db) = c("book", "word") + new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),]) + } else { + new_db = db + } + # If user provides several queries for one db, apply this function for each query + if (length(query) > 1) { + db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ct_query_reg", db=db, remakedb=FALSE) + db.match = unlist(db.match.list) + if (length(which(duplicated(db.match))) > 0) { + db.match = db.match[-which(duplicated(db.match))] + } + } else { + # Parse words in query + query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query) + query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE)) + query.words = query.words[query.words != ""] + db.match = c() + # For each words in query, detect matching characters in db and + # Keep intersect with mactches of other words + for (i in 1:length(query.words)) { + query.reg = paste0("^", query.words[i], "s?$") + db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1] + if (i == 1) { + db.match = db.match.word + } else { + db.match = intersect(db.match, db.match.word) + } + } + # Try to find matches for a concatened form of the full query + query.reg = paste0(c("^", query.words, "S?$"), collapse = "") + db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1] + db.match = union(db.match, db.match.word) + } + return(db.match) +} From bd5cfb055f283f65ba1c2a15165f59122dbc9afe Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 18 Feb 2020 10:13:43 +0800 Subject: [PATCH 06/10] Corrected function's name in apply line --- R/ctQueryReg.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/ctQueryReg.R b/R/ctQueryReg.R index 91003a7..564ddbd 100644 --- a/R/ctQueryReg.R +++ b/R/ctQueryReg.R @@ -15,7 +15,7 @@ #' will not be detected but with a specific corresponding concatened query. #' @export #' -ctQueryReg = function(query, db ,remakedb=TRUE) { +ctQueryReg = function(query, db, remakedb=TRUE) { # Parse words in db if needed if (remakedb) { new_db = matrix(ncol = 2) @@ -32,7 +32,7 @@ ctQueryReg = function(query, db ,remakedb=TRUE) { } # If user provides several queries for one db, apply this function for each query if (length(query) > 1) { - db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ct_query_reg", db=db, remakedb=FALSE) + db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryReg", db=new_db, remakedb=FALSE) db.match = unlist(db.match.list) if (length(which(duplicated(db.match))) > 0) { db.match = db.match[-which(duplicated(db.match))] @@ -58,6 +58,9 @@ ctQueryReg = function(query, db ,remakedb=TRUE) { query.reg = paste0(c("^", query.words, "S?$"), collapse = "") db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1] db.match = union(db.match, db.match.word) + if (length(db.match) == 0) { + print(paste0("No match for query: ", query)) + } } return(db.match) } From 1f6d57c74983235ae12f1a621d57ab6b884b0444 Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 18 Feb 2020 10:20:50 +0800 Subject: [PATCH 07/10] Added cell type selection to panelProjection --- R/panelProjection.R | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/R/panelProjection.R b/R/panelProjection.R index ea674ae..820ab88 100644 --- a/R/panelProjection.R +++ b/R/panelProjection.R @@ -1,6 +1,8 @@ +source("ctQueryReg.R") + #' Compute Reference Component features for clustering analysis #' -#' @param rca.obj RCA object. +#' @param sc_data data field from an RCA object. #' @param panel Reference panel as a matrix #' @param corMeth Any of the correlation measures supported by R, defaults to pearson #' @param power power to raise up to for the RCA features before clustering, default is 4 @@ -11,21 +13,42 @@ #' @export #' -panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRUE, apply_threshold=FALSE, threshold=NULL) { +panelProjection = function(sc_data, + panel, + corMeth="pearson", + power=4, + scale=TRUE, + apply_threshold=FALSE, + threshold=NULL, + ctSelection = NULL) { + + # Apply selection of cell types + if (!is.null(ctSelection)) { + selected.ct = ctQueryReg(ctSelection, colnames(panel)) + if (length(selected.ct) != 0) { + genes.names = rownames(panel) + panel = as.matrix(panel[,selected.ct]) + rownames(panel) = genes.names + colnames(panel) = selected.ct + } else { + print("No match found for this ctSelection, no projection was done.") + return(NULL) + } + } # Select genes that are shared by the input data and the panel shared_genes <- intersect(rownames(sc_data), rownames(panel)) print(paste0("Projection on ", length(shared_genes), " genes.")) # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] + subset_panel = as.matrix(panel[shared_genes, ]) + colnames(subset_panel) = colnames(panel) subset_data = sc_data[shared_genes, , drop = FALSE] # For values in the panel below the minimum threshold, set those values to threshold if (apply_threshold) { subset_panel[subset_panel <= threshold] = threshold } - # Compute projection of input data with the panel if(corMeth == "pearson") { subset_panel = as.matrix(subset_panel) @@ -46,6 +69,5 @@ panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRU center = TRUE, scale = TRUE) } - return(projection) -} \ No newline at end of file +} From 5370dd1466415d43bda4c4d6263959fdc0f536bf Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 18 Feb 2020 10:24:21 +0800 Subject: [PATCH 08/10] Added cell type selection to dataProject and handling of failed projection due to cell type selection --- R/dataProject.R | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/R/dataProject.R b/R/dataProject.R index 43e76dd..f293e01 100644 --- a/R/dataProject.R +++ b/R/dataProject.R @@ -1,5 +1,4 @@ source("panelProjection.R") -source("~/Work/RCAv2/R/panelProjection.R") #' Compute Reference Component features for clustering analysis #' #' @param rca.obj RCA object. @@ -8,10 +7,17 @@ source("~/Work/RCAv2/R/panelProjection.R") #' @param corMeth Any of the correlation measures supported by R, defaults to pearson #' @param power power to raise up to for the RCA features before clustering, default is 4 #' @param scale True if the data should be scaled, False otherwise +#' @param ctSelection A character vector of cell types if not NULL. Only selected cell types will be kept in projection. #' @return RCA object. #' @export #' -dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corMeth = "pearson", power = 4, scale = T) { +dataProject <- function(rca.obj, + method = "GlobalPanel", + customPath = NULL, + corMeth = "pearson", + power = 4, + scale = T, + ctSelection = NULL) { # Extract data sc_data <- rca.obj$data @@ -39,7 +45,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM corMeth=corMeth, power=power, scale=scale, apply_threshold=TRUE, - threshold=(ReferencePanel$at)[i]) + threshold=(ReferencePanel$at)[i], + ctSelection = ctSelection) } # Combine the projection result of multiple Global Panel fragments @@ -74,7 +81,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM sum(x)) > 0,] # Store projection data projection= panelProjection(sc_data, panel, corMeth=corMeth, - power=power, scale=scale) + power=power, scale=scale, + ctSelection = ctSelection) } # If panel for correlation is ENCODEPanel @@ -85,7 +93,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Store projection data projection= panelProjection(sc_data, ENCODEPanel, corMeth=corMeth, - power=power, scale=scale) + power=power, scale=scale, + ctSelection = ctSelection) } # If no provided method is chosen, it is assumed that the user wishes to use a custom panel @@ -96,9 +105,14 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Store projection data projection = panelProjection(sc_data, panel, corMeth=corMeth, - power=power, scale=scale) + power=power, scale=scale, + ctSelection = ctSelection) + } + + if (is.null(projection)) { + print("Not any projection was succesfull for this panel.") + return(NULL) } - # Store projection result as Matrix projection = as.matrix(projection) projection = as(projection, "dgCMatrix") From 7e2e6de3a06ec47b996cff693ccb101b19e7e86e Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 18 Feb 2020 10:25:39 +0800 Subject: [PATCH 09/10] added multiRCAPanel.R, a function for combination of projection data. --- R/multiRCAPanel.R | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 R/multiRCAPanel.R diff --git a/R/multiRCAPanel.R b/R/multiRCAPanel.R new file mode 100644 index 0000000..f16981a --- /dev/null +++ b/R/multiRCAPanel.R @@ -0,0 +1,38 @@ +#' Function to combine base reference panels for one projection. +#' +#' @param rca.obj RCA object. +#' @param panel.name Vector of methods from dataProject to select panels ("GlobalPanel", "ColonEpitheliumPanel", "ENCODEPanel" and "Custom"). +#' @param customPath Character vector with corresponding customPath to "Custom" panel name for dataProject. +#' @param corMeth Character vector with corresponding customPath to "Custom" panel name for dataProject. +#' @param power Numeric vector for corresponding power parameter to pass to dataProject for each method in panel.name. +#' @param scale Boolean vector for corresponding scale parameter to pass to dataProject for each method in panel.name. +#' @param ctSelection List of character vector for corresponding ctSelect parameter to pass to dataProject for each method in panel.name. +#' +#' @export +#' +multiRCAPanel = function(rca.obj, + panel.name=c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel"), + customPath = rep(NULL, length(panel.name)), + corMeth = rep("pearson", length(panel.name)), + power = rep(4, length(panel.name)), + scale = rep(TRUE, length(panel.name)), + ctSelection = rep(NULL, length(panel.name))) { + + full.projection = list() + for (i in 1:length(panel.name)) { + panel = panel.name[i] + print(panel) + if (panel %in% c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel", "Custom")) { + dataProject(rca.obj = rca.obj, method = panel, + corMeth = corMeth[i], power = power[i], + scale = scale[i], ctSelection = ctSelection[[i]]) + full.projection[[panel]] = rca.obj$projection.data + } else { + print("panel.name must be methods from dataProject().") + } + } + + rca.obj$projection.data = do.call("rbind", full.projection) + return(rca.obj) +} + From f7db697752d6523b759e2b1928feba7cd48b3850 Mon Sep 17 00:00:00 2001 From: leprohonmalo Date: Tue, 18 Feb 2020 14:49:09 +0800 Subject: [PATCH 10/10] Added a function to select cell types based on a tf-idf score, but did not implement this function in other functions yet. --- R/ctQueryTFIDF.R | 100 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 R/ctQueryTFIDF.R diff --git a/R/ctQueryTFIDF.R b/R/ctQueryTFIDF.R new file mode 100644 index 0000000..592c37e --- /dev/null +++ b/R/ctQueryTFIDF.R @@ -0,0 +1,100 @@ +#' Function to parse from a character vector all character matching keywords from a query using TF-IDF metric. +#' +#' @param query character or character vector. +#' @param db a character vector to parse query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with corresonding full character of db as first column. +#' @param remakedb a boolean value, default is TRUE. Put to FALSE (you should not do this) if you can provide a matrix as db as describe in db parameter. +#' @return a character vector with all matches for one query. +#' @details +#' This function will look for matches between a query and a character database using TF-IDF for text mining. +#' This function is not perfect and require some good practices: +#' - Take look at your database to design your query. +#' - Include both litteral and acronym form (if it exist) of a term in distinct query character. +#' - Avoid using plural forms in query. +#' - Correclty spell your query. +#' - Database will not always be clean, and concatenated words (for instance: forinstance) +#' will not be detected but with a specific corresponding concatened query. +#' @export +#' +ctQueryTFIDF = function(query, db, threshold=1, remakedb=TRUE) { + + # Parse words in db if needed + if (remakedb) { + new_db = matrix(ncol = 2) + for (i in db) { + ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i) + words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE)) + one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words)) + new_db = rbind(new_db, one_ct) + } + colnames(new_db) = c("book", "word") + new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),]) + + # Homogenize word spelling: + for (wd in unique(new_db[,2])) { + word.reg = paste0("^", wd, "s?$") + db.match.word = str_detect(new_db[,2], regex(word.reg, ignore_case = TRUE)) + # May need optimazation + new_db[which(db.match.word),2] = wd + } + + # Compute word count and tf_idf + new_db = new_db %>% count(book, word) + total_words = new_db %>% + group_by(book) %>% + summarize(total = sum(n)) + new_db = left_join(new_db, total_words) + new_db = new_db %>% + bind_tf_idf(word, book, n) + + # Create books x words tf_idf matrix + db.words = unique(new_db$word) + db.names = unique(new_db$book) + tfidf.table = matrix(0L, nrow = length(db.words), ncol = length(db.names)) + colnames(tfidf.table) = sort(db.names) + rownames(tfidf.table) = sort(db.words) + for (i in 1:nrow(new_db)) { + tfidf.table[as.vector(new_db$word[i]), as.vector(new_db$book[i])] = new_db$tf_idf[i] + } + + } else { + tfidf.table = db + } + # If user provides several queries for one db, apply this function for each query + if (length(query) > 1) { + db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryTFIDF", + db=tfidf.table, threshold=threshold, remakedb=FALSE) + db.match = unlist(db.match.list) + if (length(which(duplicated(db.match))) > 0) { + db.match = db.match[-which(duplicated(db.match))] + } + } else { + + # Parse words in query and find match in db for each words + query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query) + query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE)) + query.words = query.words[query.words != ""] + query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1) + colnames(query.occ) = rownames(tfidf.table) + for (i in query.words) { + word.reg = paste0("^", i, "s?$") + query.occ[1,str_detect(colnames(query.occ), regex(word.reg, ignore_case = TRUE))] = 1 + } + + # Compute cell type's score based on words match and words tfidf + ct.score = query.occ %*% tfidf.table + db.match = colnames(ct.score)[ct.score[1,] >= threshold] + + # Try to find matches for a concatened form of the full query + query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1) + colnames(query.occ) = rownames(tfidf.table) + query.reg = paste0(c("^", query.words, "s?$"), collapse = "") + query.occ[1,str_detect(colnames(query.occ), regex(query.reg, ignore_case = TRUE))] = 1 + ct.score = query.occ %*% tfidf.table + db.match.word = colnames(ct.score)[ct.score[1,] > 0.5] + db.match = union(db.match, db.match.word) + if (length(db.match) == 0) { + print(paste0("No match for query: ", query)) + } + } + return(db.match) +}