#' Parse from a character vector all entries matching all keywords of a query.
#'
#' @param query Character or character vector of queries.
#' @param db A character vector to search in. If remakedb=FALSE it must instead
#'   be a two-column matrix/data.frame with the full original string of db in
#'   the first column and one extracted word per row in the second column.
#' @param remakedb A boolean value, default is TRUE. Set to FALSE (you should
#'   normally not do this) only if you provide db already tokenised as
#'   described in the db parameter.
#' @return A character vector with all db entries matching one query.
#' @details
#' This function looks for matches between a query and a character database
#' using regular expressions for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both literal and acronym forms (if they exist) of a term as
#'   distinct query strings.
#' - Avoid using plural forms in a query (a trailing "s" is matched anyway).
#' - Correctly spell your query.
#' - The database will not always be clean: concatenated words (for instance:
#'   forinstance) are only detected by a correspondingly concatenated query.
#' @export
#'
ctQueryReg <- function(query, db, remakedb = TRUE) {
  if (remakedb) {
    # Tokenise the database once: insert a space at CamelCase boundaries,
    # then split on punctuation/whitespace, keeping (entry, word) pairs.
    pair_list <- lapply(db, function(entry) {
      spaced <- gsub("([[:upper:]][[:lower:]])", " \\1", entry)
      words <- unlist(strsplit(spaced, "[[:punct:] ]+"))
      words <- words[words != ""]
      cbind(book = rep(entry, length(words)), word = words)
    })
    new_db <- as.data.frame(do.call(rbind, pair_list))
  } else {
    new_db <- db
  }

  if (length(query) > 1) {
    # Several queries on one db: recurse once per query on the shared
    # tokenised table, then drop duplicated hits.
    db.match <- unique(unlist(
      lapply(query, ctQueryReg, db = new_db, remakedb = FALSE)
    ))
  } else {
    # Tokenise the query the same way as the database.
    query.words <- gsub("([[:upper:]][[:lower:]])", " \\1", query)
    query.words <- unlist(strsplit(query.words, "[[:punct:] ]+"))
    query.words <- query.words[query.words != ""]

    # An entry matches only if it contains EVERY query word
    # (case-insensitive, optional plural "s").
    per.word <- lapply(query.words, function(w) {
      new_db[grepl(paste0("^", w, "s?$"), new_db[, 2], ignore.case = TRUE), 1]
    })
    db.match <- if (length(per.word) > 0) {
      Reduce(intersect, per.word)
    } else {
      character(0)
    }

    # Also accept a concatenated form of the whole query
    # (e.g. "t cell" matching the db word "tcell").
    concat.reg <- paste0(c("^", query.words, "s?$"), collapse = "")
    concat.match <- new_db[grepl(concat.reg, new_db[, 2], ignore.case = TRUE), 1]
    db.match <- union(db.match, concat.match)

    if (length(db.match) == 0) {
      message("No match for query: ", query)
    }
  }
  return(db.match)
}
#' Parse from a character vector all entries matching keywords from a query
#' using the TF-IDF metric.
#'
#' @param query Character or character vector of queries.
#' @param db A character vector to search in. If remakedb=FALSE it must instead
#'   be the precomputed words x books TF-IDF matrix built by this function.
#' @param threshold Minimal TF-IDF score (summed over matched query words) for
#'   a db entry to be reported as a match. Default is 1.
#' @param remakedb A boolean value, default is TRUE. Set to FALSE (you should
#'   normally not do this) only if you provide db already as the TF-IDF matrix
#'   described in the db parameter.
#' @return A character vector with all db entries matching one query.
#' @details
#' This function looks for matches between a query and a character database
#' using TF-IDF for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both literal and acronym forms (if they exist) of a term as
#'   distinct query strings.
#' - Avoid using plural forms in a query (a trailing "s" is matched anyway).
#' - Correctly spell your query.
#' - The database will not always be clean: concatenated words (for instance:
#'   forinstance) are only detected by a correspondingly concatenated query.
#' @export
#'
ctQueryTFIDF <- function(query, db, threshold = 1, remakedb = TRUE) {

  if (remakedb) {
    # Tokenise each db entry: insert a space at CamelCase boundaries, then
    # split on punctuation/whitespace, keeping (entry, word) pairs.
    pair_list <- lapply(db, function(entry) {
      spaced <- gsub('([[:upper:]][[:lower:]])', ' \\1', entry)
      words <- as.vector(str_split(spaced, regex("[[:punct:] ]+"), simplify = TRUE))
      words <- words[words != ""]
      cbind(rep(entry, length(words)), words)
    })
    new_db <- as.data.frame(do.call(rbind, pair_list))
    colnames(new_db) <- c("book", "word")

    # Homogenize word spelling: collapse case and singular/plural variants
    # of each word onto the first spelling encountered, so they share one
    # vocabulary entry for counting. (Quadratic in vocabulary size; may need
    # optimisation for large databases.)
    for (wd in unique(new_db[, 2])) {
      word.reg <- paste0("^", wd, "s?$")
      hits <- str_detect(new_db[, 2], regex(word.reg, ignore_case = TRUE))
      new_db[hits, 2] <- wd
    }

    # Per-book word counts, then TF-IDF weights (tidytext).
    new_db <- new_db %>% count(book, word)
    total_words <- new_db %>%
      group_by(book) %>%
      summarize(total = sum(n))
    # Explicit `by` keeps the join intent clear and silences the
    # "Joining, by = ..." message.
    new_db <- left_join(new_db, total_words, by = "book")
    new_db <- new_db %>%
      bind_tf_idf(word, book, n)

    # Build the words x books TF-IDF matrix (absent word/book pairs stay 0).
    db.words <- unique(new_db$word)
    db.names <- unique(new_db$book)
    tfidf.table <- matrix(0, nrow = length(db.words), ncol = length(db.names))
    colnames(tfidf.table) <- sort(db.names)
    rownames(tfidf.table) <- sort(db.words)
    for (i in seq_len(nrow(new_db))) {
      # as.character() guards against factor columns turning name-indexing
      # into integer indexing.
      tfidf.table[as.character(new_db$word[i]), as.character(new_db$book[i])] <-
        new_db$tf_idf[i]
    }

  } else {
    tfidf.table <- db
  }

  if (length(query) > 1) {
    # Several queries on one db: recurse once per query on the shared
    # TF-IDF matrix, then drop duplicated hits.
    db.match <- unique(unlist(
      lapply(query, ctQueryTFIDF,
             db = tfidf.table, threshold = threshold, remakedb = FALSE)
    ))
  } else {

    # Tokenise the query the same way as the database.
    query.words <- gsub('([[:upper:]][[:lower:]])', ' \\1', query)
    query.words <- as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
    query.words <- query.words[query.words != ""]

    # Binary occurrence row vector over the db vocabulary: 1 where a query
    # word matches a vocabulary word (case-insensitive, optional plural "s").
    query.occ <- matrix(0, nrow = 1, ncol = nrow(tfidf.table))
    colnames(query.occ) <- rownames(tfidf.table)
    for (w in query.words) {
      word.reg <- paste0("^", w, "s?$")
      query.occ[1, str_detect(colnames(query.occ), regex(word.reg, ignore_case = TRUE))] <- 1
    }

    # Score each book as the sum of TF-IDF weights of its matched words.
    ct.score <- query.occ %*% tfidf.table
    db.match <- colnames(ct.score)[ct.score[1, ] >= threshold]

    # Also try a concatenated form of the full query ("t cell" -> "tcell"),
    # accepted at a fixed lower score cutoff since it is a single rare word.
    query.occ <- matrix(0, nrow = 1, ncol = nrow(tfidf.table))
    colnames(query.occ) <- rownames(tfidf.table)
    concat.reg <- paste0(c("^", query.words, "s?$"), collapse = "")
    query.occ[1, str_detect(colnames(query.occ), regex(concat.reg, ignore_case = TRUE))] <- 1
    ct.score <- query.occ %*% tfidf.table
    db.match <- union(db.match, colnames(ct.score)[ct.score[1, ] > 0.5])

    if (length(db.match) == 0) {
      message("No match for query: ", query)
    }
  }
  return(db.match)
}
#' @export #' -dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corMeth = "pearson", power = 4, scale = T) { +dataProject <- function(rca.obj, + method = "GlobalPanel", + customPath = NULL, + corMeth = "pearson", + power = 4, + scale = T, + ctSelection = NULL) { # Extract data sc_data <- rca.obj$data @@ -22,45 +30,15 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # For each fragment of the Global Panel for (i in 1:length(ReferencePanel[[1]])) { - - # Initialise panel - panel = ReferencePanel[[1]][[i]] - - # Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # For values in the panel below the minimum threshold, set those values to threshold - subset_panel[subset_panel <= (ReferencePanel$at)[i]] = (ReferencePanel$at)[i] - - # Compute projection of input data with the panel fragment - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection_fragment <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection_fragment <- cor(subset_panel, subset_data, method = corMeth) - } - - - # Reattach dimnames - colnames(projection_fragment) <- colnames(subset_data) - rownames(projection_fragment) <- colnames(subset_panel) - - # Raise the projection fragment to power - projection_fragment = abs(projection_fragment) ^ (power) * sign(projection_fragment) - - # If scaling is required - if (scale) { - - # Scale - projection_fragment = scale(projection_fragment, center = TRUE, scale = TRUE) - } - + # Store projection data of fragment of Global Panel - projection_list[[i]] = projection_fragment + projection_list[[i]] = panelProjection(sc_data, + ReferencePanel[[1]][[i]], + corMeth=corMeth, + power=power, scale=scale, + apply_threshold=TRUE, + 
threshold=(ReferencePanel$at)[i], + ctSelection = ctSelection) } # Combine the projection result of multiple Global Panel fragments @@ -70,59 +48,42 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # If panel for correlation is ColonEpitheliumPanel else if (method == "ColonEpitheliumPanel") { + panel = ReferencePanel$ColonEpiPanel + # Convert rownames to gene symbol and sum duplicates + gene.names = as.character(str_extract_all(rownames(panel), "_.+_")) + gene.names = str_sub(gene.names, 2, -2) + panel = cbind(panel, gene.names) + dup.genes = unique(gene.names[duplicated(gene.names)]) + for (gene in dup.genes) { + sub.panel = panel[which(gene.names == gene),1:(ncol(panel) - 1)] + new.row = apply(sub.panel, MARGIN = 2, FUN = "sum") + for (i in which(gene.names == gene)) { + panel[i, 1:(ncol(panel) - 1)] = new.row + } + } + panel = panel[-which(duplicated(panel$gene.names)),] + rownames(panel) = panel$gene.names + panel = panel[,-ncol(panel)] # Scale panel by median - fc = apply(ReferencePanel$ColonEpiPanel, 1, function(x) x - median(x)) - + fc = apply(panel, 1, function(x) x - median(x)) fs = fc > 1.5 - - fs1 = rownames(ReferencePanel$ColonEpiPanel[apply(fs, 1, function(x) - sum(x)) > 0,]) - gl_intersect = intersect(rownames(fpkm_temp), fs1) - projection = as.data.frame(cor(fpkm_temp[gl_intersect,], ReferencePanel$ColonEpiPanel[gl_intersect,], corMeth)) - projection = abs(projection) ^ (power) * sign(projection) - if (scale) { - projection = scale(projection, - center = TRUE, - scale = TRUE) - } + panel = panel[apply(fs, 1, function(x) + sum(x)) > 0,] + # Store projection data + projection= panelProjection(sc_data, panel, corMeth=corMeth, + power=power, scale=scale, + ctSelection = ctSelection) } + # If any other panel is chosen else if (method %in% names(ReferencePanel)) { panel <- ReferencePanel[[method]] - # Initialise variable to store projection data from the two fragments of the Global Panel - projection_list = list() - - # 
Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # Compute projection of input data with the panel - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection <- cor(subset_panel, subset_data, method = corMeth) - } - rownames(projection) <- colnames(subset_panel) - colnames(projection) <- colnames(subset_data) - - # Raise the projection to power - projection = abs(projection) ^ (power) * sign(projection) - - # If scaling is required - if (scale) { - - # Scale - projection = scale(projection, - center = TRUE, - scale = TRUE) - } - + # Store projection data + projection= panelProjection(sc_data, panel, corMeth=corMeth, + power=power, scale=scale, + ctSelection = ctSelection) } # If no provided method is chosen, it is assumed that the user wishes to use a custom panel @@ -130,37 +91,17 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM # Load panel from path provided panel <- readRDS(customPath) - - # Select genes that are shared by the input data and the panel - shared_genes <- intersect(rownames(sc_data), rownames(panel)) - - # Reduce the panel and input data to the shared genes - subset_panel = panel[shared_genes, ] - subset_data = sc_data[shared_genes, , drop = FALSE] - - # Compute projection of input data with the panel - if(corMeth == "pearson") { - subset_panel = as.matrix(subset_panel) - projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data) - } else { - projection <- cor(subset_panel, subset_data, method = corMeth) - } - rownames(projection) <- colnames(subset_panel) - colnames(projection) <- colnames(subset_data) - - # Raise the projection to power - projection = abs(projection) ^ 
(power) * sign(projection) - - # If scaling is required - if (scale) { - - # Scale - projection = scale(projection, - center = TRUE, - scale = TRUE) - } + + # Store projection data + projection = panelProjection(sc_data, panel, corMeth=corMeth, + power=power, scale=scale, + ctSelection = ctSelection) + } + + if (is.null(projection)) { + print("Not any projection was succesfull for this panel.") + return(NULL) } - # Store projection result as Matrix projection = as.matrix(projection) projection = as(projection, "dgCMatrix") @@ -172,3 +113,4 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM return(rca.obj) } + \ No newline at end of file diff --git a/R/multiRCAPanel.R b/R/multiRCAPanel.R new file mode 100644 index 0000000..f16981a --- /dev/null +++ b/R/multiRCAPanel.R @@ -0,0 +1,38 @@ +#' Function to combine base reference panels for one projection. +#' +#' @param rca.obj RCA object. +#' @param panel.name Vector of methods from dataProject to select panels ("GlobalPanel", "ColonEpitheliumPanel", "ENCODEPanel" and "Custom"). +#' @param customPath Character vector with corresponding customPath to "Custom" panel name for dataProject. +#' @param corMeth Character vector with corresponding customPath to "Custom" panel name for dataProject. +#' @param power Numeric vector for corresponding power parameter to pass to dataProject for each method in panel.name. +#' @param scale Boolean vector for corresponding scale parameter to pass to dataProject for each method in panel.name. +#' @param ctSelection List of character vector for corresponding ctSelect parameter to pass to dataProject for each method in panel.name. 
#' Function to combine base reference panels for one projection.
#'
#' @param rca.obj RCA object.
#' @param panel.name Vector of methods from dataProject() selecting panels
#'   ("GlobalPanel", "ColonEpitheliumPanel", "ENCODEPanel" and "Custom").
#' @param customPath Character vector of paths passed to dataProject() for the
#'   corresponding "Custom" entries of panel.name; NULL (the default) for none.
#' @param corMeth Character vector of correlation methods, one per entry of
#'   panel.name, passed to dataProject().
#' @param power Numeric vector of power parameters, one per entry of
#'   panel.name, passed to dataProject().
#' @param scale Boolean vector of scale parameters, one per entry of
#'   panel.name, passed to dataProject().
#' @param ctSelection List of character vectors (or NULL elements), one per
#'   entry of panel.name, passed as ctSelection to dataProject().
#' @return rca.obj with projection.data set to the row-bound projections of
#'   all requested panels.
#' @export
#'
multiRCAPanel <- function(rca.obj,
                          panel.name = c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel"),
                          customPath = NULL,
                          corMeth = rep("pearson", length(panel.name)),
                          power = rep(4, length(panel.name)),
                          scale = rep(TRUE, length(panel.name)),
                          ctSelection = vector("list", length(panel.name))) {

  # A plain NULL ctSelection (the old default, rep(NULL, n) == NULL) would
  # make ctSelection[[i]] error; normalise it to a list of NULLs.
  if (is.null(ctSelection)) {
    ctSelection <- vector("list", length(panel.name))
  }

  full.projection <- list()
  for (i in seq_along(panel.name)) {
    panel <- panel.name[i]
    message("Projecting on panel: ", panel)
    if (panel %in% c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel", "Custom")) {
      # dataProject() returns a modified COPY of rca.obj; the result must be
      # captured (the previous code discarded it and then read the unmodified
      # input object's projection.data). customPath[i] is NULL when customPath
      # is NULL, so non-"Custom" panels are unaffected.
      projected <- dataProject(rca.obj = rca.obj, method = panel,
                               customPath = customPath[i],
                               corMeth = corMeth[i], power = power[i],
                               scale = scale[i], ctSelection = ctSelection[[i]])
      # Index by position, not by panel name: a repeated panel name must not
      # silently overwrite an earlier projection.
      full.projection[[i]] <- projected$projection.data
    } else {
      warning("panel.name entries must be methods from dataProject(); skipping: ",
              panel, call. = FALSE)
    }
  }

  rca.obj$projection.data <- do.call("rbind", full.projection)
  return(rca.obj)
}
#' Compute Reference Component features for clustering analysis.
#'
#' @param sc_data data field from an RCA object (genes x cells).
#' @param panel Reference panel as a matrix or data.frame (genes x cell types).
#' @param corMeth Any of the correlation measures supported by R, defaults to
#'   pearson.
#' @param power Power to raise the RCA features to before clustering,
#'   default is 4.
#' @param scale TRUE if the projection should be column-scaled, FALSE
#'   otherwise.
#' @param apply_threshold TRUE if a floor should be applied to the lowest
#'   expression values of the panel, FALSE otherwise.
#' @param threshold Minimal wished expression value for panel data. All lower
#'   values in the panel are raised to this value when apply_threshold is TRUE.
#' @param ctSelection Character vector of cell-type queries (see ctQueryReg)
#'   used to subset the panel columns, or NULL to keep all cell types.
#' @return The projection matrix (cell types x cells), or NULL when
#'   ctSelection matches no panel column.
#' @export
#'
panelProjection <- function(sc_data,
                            panel,
                            corMeth = "pearson",
                            power = 4,
                            scale = TRUE,
                            apply_threshold = FALSE,
                            threshold = NULL,
                            ctSelection = NULL) {

  # Optionally restrict the panel to the cell types matching the queries.
  if (!is.null(ctSelection)) {
    selected.ct <- ctQueryReg(ctSelection, colnames(panel))
    if (length(selected.ct) == 0) {
      message("No match found for this ctSelection, no projection was done.")
      return(NULL)
    }
    # drop = FALSE keeps the matrix shape (and dimnames) when a single cell
    # type is selected, so no manual dimname re-attachment is needed.
    panel <- panel[, selected.ct, drop = FALSE]
  }

  # Reduce the panel and input data to the genes they share.
  shared_genes <- intersect(rownames(sc_data), rownames(panel))
  message("Projection on ", length(shared_genes), " genes.")

  # drop = FALSE: with exactly one shared gene, plain indexing would collapse
  # the panel to a vector and as.matrix() would then transpose it into a
  # one-column matrix with cell types as rows.
  subset_panel <- as.matrix(panel[shared_genes, , drop = FALSE])
  subset_data <- sc_data[shared_genes, , drop = FALSE]

  # Floor the lowest panel values at the requested threshold.
  if (apply_threshold) {
    subset_panel[subset_panel <= threshold] <- threshold
  }

  # Project the input data onto the panel: correlation of every cell with
  # every cell type over the shared genes.
  if (corMeth == "pearson") {
    projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
  } else {
    projection <- cor(subset_panel, subset_data, method = corMeth)
  }
  rownames(projection) <- colnames(subset_panel)
  colnames(projection) <- colnames(subset_data)

  # Raise to power while preserving the sign of each correlation.
  projection <- abs(projection)^(power) * sign(projection)

  # Optionally center/scale each cell's feature vector.
  if (scale) {
    projection <- scale(projection,
                        center = TRUE,
                        scale = TRUE)
  }
  return(projection)
}