Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions R/ctQueryReg.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#' Function to parse from a character vector all characters matching all keywords from a query.
#'
#' @param query character or character vector.
#' @param db a character vector to parse the query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with the corresponding full character of db as first column.
#' @param remakedb a boolean value, default is TRUE. Set to FALSE (you should not do this) only if you can provide a matrix as db, as described in the db parameter.
#' @return a character vector with all matches for one query.
#' @details
#' This function will look for matches between a query and a character database using regular expressions for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both the literal and acronym form (if it exists) of a term as distinct query characters.
#' - Avoid using plural forms in the query.
#' - Correctly spell your query.
#' - The database will not always be clean, and concatenated words (for instance: forinstance)
#' will only be detected by a specific corresponding concatenated query.
#' @export
#'
ctQueryReg = function(query, db, remakedb=TRUE) {
  # Build the word lookup table if needed: one row per (entry, word) pair,
  # column 1 ("book") = original db entry, column 2 ("word") = one word of it.
  if (remakedb) {
    rows = lapply(db, function(entry) {
      # Insert a space before every Upper+lower pair so camelCase entries split.
      spaced = gsub('([[:upper:]][[:lower:]])', ' \\1', entry)
      # Split on runs of punctuation and spaces.
      words = as.vector(str_split(spaced, regex("[[:punct:] ]+"), simplify = TRUE))
      cbind(rep(entry, length(words)), words)
    })
    # Bind once instead of growing the matrix inside the loop (avoids the
    # placeholder NA row of matrix(ncol = 2) and the O(n^2) rbind pattern).
    new_db = do.call(rbind, rows)
    colnames(new_db) = c("book", "word")
    # Drop empty-string words produced by leading/trailing separators.
    # drop = FALSE keeps a matrix even when a single row remains.
    new_db = as.data.frame(new_db[new_db[, 2] != "", , drop = FALSE])
  } else {
    new_db = db
  }

  # Several queries on one db: recurse per query on the already-parsed table,
  # then de-duplicate the union of the results.
  if (length(query) > 1) {
    per_query = lapply(query, ctQueryReg, db = new_db, remakedb = FALSE)
    db.match = unique(unlist(per_query))
  } else {
    # Parse the query into words exactly like the db entries.
    query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
    query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
    query.words = query.words[query.words != ""]
    db.match = c()
    # For each query word, collect db entries containing that word (with an
    # optional plural "s"); keep only entries matching ALL query words.
    for (i in seq_along(query.words)) {
      query.reg = paste0("^", query.words[i], "s?$")
      db.match.word = new_db[str_detect(new_db[, 2], regex(query.reg, ignore_case = TRUE)), 1]
      if (i == 1) {
        db.match = db.match.word
      } else {
        db.match = intersect(db.match, db.match.word)
      }
    }
    # Also try a concatenated form of the full query (e.g. "forinstance").
    # Lowercase "s?" for consistency with the per-word pattern above
    # (matching is case-insensitive either way).
    query.reg = paste0(c("^", query.words, "s?$"), collapse = "")
    db.match.word = new_db[str_detect(new_db[, 2], regex(query.reg, ignore_case = TRUE)), 1]
    db.match = union(db.match, db.match.word)
    if (length(db.match) == 0) {
      print(paste0("No match for query: ", query))
    }
  }
  return(db.match)
}
100 changes: 100 additions & 0 deletions R/ctQueryTFIDF.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#' Function to parse from a character vector all characters matching keywords from a query using the TF-IDF metric.
#'
#' @param query character or character vector.
#' @param db a character vector to parse the query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with the corresponding full character of db as first column.
#' @param threshold numeric, default is 1. Minimal summed TF-IDF score a database entry must reach to be reported as a match.
#' @param remakedb a boolean value, default is TRUE. Set to FALSE (you should not do this) only if you can provide a matrix as db, as described in the db parameter.
#' @return a character vector with all matches for one query.
#' @details
#' This function will look for matches between a query and a character database using TF-IDF for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both the literal and acronym form (if it exists) of a term as distinct query characters.
#' - Avoid using plural forms in the query.
#' - Correctly spell your query.
#' - The database will not always be clean, and concatenated words (for instance: forinstance)
#' will only be detected by a specific corresponding concatenated query.
#' @export
#'
ctQueryTFIDF = function(query, db, threshold=1, remakedb=TRUE) {

  # Parse words in db if needed: build one row per (entry, word) pair, with
  # the full original entry as "book" and each extracted word as "word".
  if (remakedb) {
    new_db = matrix(ncol = 2)
    for (i in db) {
      # Insert a space before every Upper+lower pair so camelCase entries split.
      ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i)
      # Split on runs of punctuation and spaces (stringr).
      words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE))
      one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words))
      new_db = rbind(new_db, one_ct)
    }
    colnames(new_db) = c("book", "word")
    # Drop the placeholder NA row from matrix(ncol = 2) plus empty-string words.
    new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),])

    # Homogenize word spelling: collapse case variants and plural "s" forms
    # onto a single representative so they count as the same term below.
    # NOTE(review): this mutates the same column it scans; the representative
    # chosen depends on the order of unique() — presumably acceptable here.
    for (wd in unique(new_db[,2])) {
      word.reg = paste0("^", wd, "s?$")
      db.match.word = str_detect(new_db[,2], regex(word.reg, ignore_case = TRUE))
      # May need optimization (one full regex scan per unique word).
      new_db[which(db.match.word),2] = wd
    }

    # Compute word count and tf_idf (dplyr count/group_by + tidytext
    # bind_tf_idf). left_join has no explicit `by`; it joins on the shared
    # "book" column — TODO confirm and consider making it explicit.
    new_db = new_db %>% count(book, word)
    total_words = new_db %>%
      group_by(book) %>%
      summarize(total = sum(n))
    new_db = left_join(new_db, total_words)
    new_db = new_db %>%
      bind_tf_idf(word, book, n)

    # Create the words x books tf_idf matrix (rows = words, cols = books),
    # zero-initialised and filled by name so the sorted dimnames are safe.
    db.words = unique(new_db$word)
    db.names = unique(new_db$book)
    tfidf.table = matrix(0L, nrow = length(db.words), ncol = length(db.names))
    colnames(tfidf.table) = sort(db.names)
    rownames(tfidf.table) = sort(db.words)
    for (i in 1:nrow(new_db)) {
      tfidf.table[as.vector(new_db$word[i]), as.vector(new_db$book[i])] = new_db$tf_idf[i]
    }

  } else {
    # Caller supplied a pre-built words x books tf-idf matrix.
    tfidf.table = db
  }
  # If user provides several queries for one db, apply this function for each
  # query on the already-built matrix, then drop duplicate matches.
  if (length(query) > 1) {
    db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryTFIDF",
                          db=tfidf.table, threshold=threshold, remakedb=FALSE)
    db.match = unlist(db.match.list)
    if (length(which(duplicated(db.match))) > 0) {
      db.match = db.match[-which(duplicated(db.match))]
    }
  } else {

    # Parse words in query (same camelCase/punctuation split as the db) and
    # build a 1 x words indicator row: 1 where a db word matches a query word
    # (optional plural "s", case-insensitive), 0 elsewhere.
    query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
    query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
    query.words = query.words[query.words != ""]
    query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
    colnames(query.occ) = rownames(tfidf.table)
    for (i in query.words) {
      word.reg = paste0("^", i, "s?$")
      query.occ[1,str_detect(colnames(query.occ), regex(word.reg, ignore_case = TRUE))] = 1
    }

    # Compute each book's score as the sum of tf-idf weights of the matched
    # words (1 x words %*% words x books); keep books reaching the threshold.
    ct.score = query.occ %*% tfidf.table
    db.match = colnames(ct.score)[ct.score[1,] >= threshold]

    # Try to find matches for a concatenated form of the full query
    # (e.g. "forinstance"), scored the same way but with a fixed 0.5 cutoff.
    # NOTE(review): 0.5 is hard-coded rather than using `threshold` — confirm
    # this is intentional.
    query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
    colnames(query.occ) = rownames(tfidf.table)
    query.reg = paste0(c("^", query.words, "s?$"), collapse = "")
    query.occ[1,str_detect(colnames(query.occ), regex(query.reg, ignore_case = TRUE))] = 1
    ct.score = query.occ %*% tfidf.table
    db.match.word = colnames(ct.score)[ct.score[1,] > 0.5]
    db.match = union(db.match, db.match.word)
    if (length(db.match) == 0) {
      print(paste0("No match for query: ", query))
    }
  }
  return(db.match)
}
170 changes: 56 additions & 114 deletions R/dataProject.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
source("panelProjection.R")
#' Compute Reference Component features for clustering analysis
#'
#' @param rca.obj RCA object.
Expand All @@ -6,10 +7,17 @@
#' @param corMeth Any of the correlation measures supported by R, defaults to pearson
#' @param power power to raise up to for the RCA features before clustering, default is 4
#' @param scale True if the data should be scaled, False otherwise
#' @param ctSelection A character vector of cell types if not NULL. Only selected cell types will be kept in projection.
#' @return RCA object.
#' @export
#'
dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corMeth = "pearson", power = 4, scale = T) {
dataProject <- function(rca.obj,
method = "GlobalPanel",
customPath = NULL,
corMeth = "pearson",
power = 4,
scale = T,
ctSelection = NULL) {

# Extract data
sc_data <- rca.obj$data
Expand All @@ -22,45 +30,15 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM

# For each fragment of the Global Panel
for (i in 1:length(ReferencePanel[[1]])) {

# Initialise panel
panel = ReferencePanel[[1]][[i]]

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# For values in the panel below the minimum threshold, set those values to threshold
subset_panel[subset_panel <= (ReferencePanel$at)[i]] = (ReferencePanel$at)[i]

# Compute projection of input data with the panel fragment
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection_fragment <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection_fragment <- cor(subset_panel, subset_data, method = corMeth)
}


# Reattach dimnames
colnames(projection_fragment) <- colnames(subset_data)
rownames(projection_fragment) <- colnames(subset_panel)

# Raise the projection fragment to power
projection_fragment = abs(projection_fragment) ^ (power) * sign(projection_fragment)

# If scaling is required
if (scale) {

# Scale
projection_fragment = scale(projection_fragment, center = TRUE, scale = TRUE)
}


# Store projection data of fragment of Global Panel
projection_list[[i]] = projection_fragment
projection_list[[i]] = panelProjection(sc_data,
ReferencePanel[[1]][[i]],
corMeth=corMeth,
power=power, scale=scale,
apply_threshold=TRUE,
threshold=(ReferencePanel$at)[i],
ctSelection = ctSelection)
}

# Combine the projection result of multiple Global Panel fragments
Expand All @@ -70,97 +48,60 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
# If panel for correlation is ColonEpitheliumPanel
else if (method == "ColonEpitheliumPanel") {

panel = ReferencePanel$ColonEpiPanel
# Convert rownames to gene symbol and sum duplicates
gene.names = as.character(str_extract_all(rownames(panel), "_.+_"))
gene.names = str_sub(gene.names, 2, -2)
panel = cbind(panel, gene.names)
dup.genes = unique(gene.names[duplicated(gene.names)])
for (gene in dup.genes) {
sub.panel = panel[which(gene.names == gene),1:(ncol(panel) - 1)]
new.row = apply(sub.panel, MARGIN = 2, FUN = "sum")
for (i in which(gene.names == gene)) {
panel[i, 1:(ncol(panel) - 1)] = new.row
}
}
panel = panel[-which(duplicated(panel$gene.names)),]
rownames(panel) = panel$gene.names
panel = panel[,-ncol(panel)]
# Scale panel by median
fc = apply(ReferencePanel$ColonEpiPanel, 1, function(x) x - median(x))

fc = apply(panel, 1, function(x) x - median(x))
fs = fc > 1.5

fs1 = rownames(ReferencePanel$ColonEpiPanel[apply(fs, 1, function(x)
sum(x)) > 0,])
gl_intersect = intersect(rownames(fpkm_temp), fs1)
projection = as.data.frame(cor(fpkm_temp[gl_intersect,], ReferencePanel$ColonEpiPanel[gl_intersect,], corMeth))
projection = abs(projection) ^ (power) * sign(projection)
if (scale) {
projection = scale(projection,
center = TRUE,
scale = TRUE)
}
panel = panel[apply(fs, 1, function(x)
sum(x)) > 0,]
# Store projection data
projection= panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

# If any other panel is chosen
else if (method %in% names(ReferencePanel)) {

panel <- ReferencePanel[[method]]

# Initialise variable to store projection data from the two fragments of the Global Panel
projection_list = list()

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# Compute projection of input data with the panel
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection <- cor(subset_panel, subset_data, method = corMeth)
}
rownames(projection) <- colnames(subset_panel)
colnames(projection) <- colnames(subset_data)

# Raise the projection to power
projection = abs(projection) ^ (power) * sign(projection)

# If scaling is required
if (scale) {

# Scale
projection = scale(projection,
center = TRUE,
scale = TRUE)
}

# Store projection data
projection= panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

# If no provided method is chosen, it is assumed that the user wishes to use a custom panel
else {

# Load panel from path provided
panel <- readRDS(customPath)

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# Compute projection of input data with the panel
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection <- cor(subset_panel, subset_data, method = corMeth)
}
rownames(projection) <- colnames(subset_panel)
colnames(projection) <- colnames(subset_data)

# Raise the projection to power
projection = abs(projection) ^ (power) * sign(projection)

# If scaling is required
if (scale) {

# Scale
projection = scale(projection,
center = TRUE,
scale = TRUE)
}

# Store projection data
projection = panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

if (is.null(projection)) {
print("Not any projection was succesfull for this panel.")
return(NULL)
}

# Store projection result as Matrix
projection = as.matrix(projection)
projection = as(projection, "dgCMatrix")
Expand All @@ -172,3 +113,4 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM

return(rca.obj)
}

Loading