From 7dee9f63971a2a11139d1405d56a8a24e8d0781d Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 11 Feb 2020 09:37:49 +0800
Subject: [PATCH 01/10] Factorized some code from dataProject.R

---
 R/panelProjection.R | 51 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 R/panelProjection.R

diff --git a/R/panelProjection.R b/R/panelProjection.R
new file mode 100644
index 0000000..ea674ae
--- /dev/null
+++ b/R/panelProjection.R
@@ -0,0 +1,51 @@
+#' Compute Reference Component features for clustering analysis
+#'
+#' @param rca.obj RCA object.
+#' @param panel Reference panel as a matrix
+#' @param corMeth Any of the correlation measures supported by R, defaults to pearson
+#' @param power power to raise up to for the RCA features before clustering, default is 4
+#' @param scale True if the data should be scaled, False otherwise
+#' @param apply_threshold True if a threshold to raise the lowest expression values needs to be applied on panel data, False otherwise.
+#' @param threshold Minimal wished expression value for panel data. All lower expression values in panel will be raised to this value if apply_threshold is TRUE. 
+#' @return RCA object.
+#' @export
+#'
+
+panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRUE, apply_threshold=FALSE, threshold=NULL) {
+    
+    # Select genes that are shared by the input data and the panel
+    shared_genes <- intersect(rownames(sc_data), rownames(panel))
+    print(paste0("Projection on ", length(shared_genes), " genes."))
+    
+    # Reduce the panel and input data to the shared genes
+    subset_panel = panel[shared_genes, ]
+    subset_data = sc_data[shared_genes, , drop = FALSE]
+    
+    # For values in the panel below the minimum threshold, set those values to threshold
+    if (apply_threshold) {
+        subset_panel[subset_panel <= threshold] = threshold
+    }
+    
+    # Compute projection of input data with the panel
+    if(corMeth == "pearson") {
+        subset_panel = as.matrix(subset_panel)
+        projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
+    } else {
+        projection <- cor(subset_panel, subset_data, method = corMeth)
+    }
+    rownames(projection) <- colnames(subset_panel)
+    colnames(projection) <- colnames(subset_data)
+    
+    # Raise the projection to power
+    projection = abs(projection) ^ (power) * sign(projection)
+    
+    # If scaling is required
+    if (scale) {
+        # Scale
+        projection = scale(projection,
+                           center = TRUE,
+                           scale = TRUE)
+    }
+    
+    return(projection)
+}
\ No newline at end of file

From 7bf4a471e4fe69dc7cc10f4b279bd8dcbbb2c7eb Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 11 Feb 2020 09:41:28 +0800
Subject: [PATCH 02/10] Factorized version of dataProject.R, working on
 implementing ColonEpitheliumPanel

---
 R/dataProject.R | 142 +++++++++---------------------------------------
 1 file changed, 27 insertions(+), 115 deletions(-)

diff --git a/R/dataProject.R b/R/dataProject.R
index 22c3be3..4304cbd 100644
--- a/R/dataProject.R
+++ b/R/dataProject.R
@@ -1,3 +1,5 @@
+source("panelProjection.R")
+
 #' Compute Reference Component features for clustering analysis
 #'
 #' @param rca.obj RCA object.
@@ -30,45 +32,14 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
 
         # For each fragment of the Global Panel
         for (i in 1:length(ReferencePanel[[1]])) {
-
-            # Initialise panel
-            panel = ReferencePanel[[1]][[i]]
-
-            # Select genes that are shared by the input data and the panel
-            shared_genes <- intersect(rownames(sc_data), rownames(panel))
-
-            # Reduce the panel and input data to the shared genes
-            subset_panel = panel[shared_genes, ]
-            subset_data = sc_data[shared_genes, , drop = FALSE]
-
-            # For values in the panel below the minimum threshold, set those values to threshold
-            subset_panel[subset_panel <= (ReferencePanel$at)[i]] = (ReferencePanel$at)[i]
-
-            # Compute projection of input data with the panel fragment
-            if(corMeth == "pearson") {
-                subset_panel = as.matrix(subset_panel)
-                projection_fragment <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
-            } else {
-                projection_fragment <- cor(subset_panel, subset_data, method = corMeth)
-            }
-
-
-            # Reattach dimnames
-            colnames(projection_fragment) <- colnames(subset_data)
-            rownames(projection_fragment) <- colnames(subset_panel)
-
-            # Raise the projection fragment to power
-            projection_fragment = abs(projection_fragment) ^ (power) * sign(projection_fragment)
-
-            # If scaling is required
-            if (scale) {
-
-                # Scale
-                projection_fragment = scale(projection_fragment, center = TRUE, scale = TRUE)
-            }
-
+            
             # Store projection data of fragment of Global Panel
-            projection_list[[i]] = projection_fragment
+            projection_list[[i]] = panelProjection(sc_data,
+                                                   ReferencePanel[[1]][[i]],
+                                                   corMeth=corMeth,
+                                                   power=power, scale=scale, 
+                                                   apply_threshold=TRUE, 
+                                                   threshold=(ReferencePanel$at)[i])
         }
 
         # Combine the projection result of multiple Global Panel fragments
@@ -80,63 +51,29 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
         
         # Load reference panel data from environment
         data(ReferencePanel, envir = environment())
-        
+        panel = ReferencePanel$ColonEpiPanel
+        gene.names = as.character(str_extract_all(rownames(panel), "_[[:alnum:]]+_"))
+        rownames(panel) = str_sub(gene.names, 2, -2)
         # Scale panel by median
-        fc = apply(ReferencePanel$ColonEpiPanel, 1, function(x) x - median(x))
-
+        fc = apply(panel, 1, function(x) x - median(x))
         fs = fc > 1.5
-
-        fs1 = rownames(ReferencePanel$ColonEpiPanel[apply(fs, 1, function(x)
-            sum(x)) > 0,])
-        gl_intersect = intersect(rownames(fpkm_temp), fs1)
-        projection = as.data.frame(cor(fpkm_temp[gl_intersect,], ReferencePanel$ColonEpiPanel[gl_intersect,], corMeth))
-        projection = abs(projection) ^ (power) * sign(projection)
-        if (scale) {
-            projection = scale(projection,
-                               center = TRUE,
-                               scale = TRUE)
-        }
+        panel + panel[apply(fs, 1, function(x)
+            sum(x)) > 0,]
+        
+        # Store projection data
+        projection= panelProjection(sc_data, panel, corMeth=corMeth,
+                                    power=power, scale=scale)
     }
+    
     # If panel for correlation is ENCODEPanel
     else if (method == "ENCODEPanel") {
         
         # Load reference panel data from environment
         data("ENCODEPanel", envir = environment())
         
-        panel <- ENCODEPanel
-        
-        # Initialise variable to store projection data from the two fragments of the Global Panel
-        projection_list = list()
-        
-        # Select genes that are shared by the input data and the panel
-        shared_genes <- intersect(rownames(sc_data), rownames(panel))
-        
-        # Reduce the panel and input data to the shared genes
-        subset_panel = panel[shared_genes, ]
-        subset_data = sc_data[shared_genes, , drop = FALSE]
-        
-        # Compute projection of input data with the panel
-        if(corMeth == "pearson") {
-            subset_panel = as.matrix(subset_panel)
-            projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
-        } else {
-            projection <- cor(subset_panel, subset_data, method = corMeth)
-        }
-        rownames(projection) <- colnames(subset_panel)
-        colnames(projection) <- colnames(subset_data)
-        
-        # Raise the projection to power
-        projection = abs(projection) ^ (power) * sign(projection)
-        
-        # If scaling is required
-        if (scale) {
-            
-            # Scale
-            projection = scale(projection,
-                               center = TRUE,
-                               scale = TRUE)
-        }
-        
+        # Store projection data
+        projection= panelProjection(sc_data, ENCODEPanel, corMeth=corMeth,
+                                    power=power, scale=scale)
     }
     
     # If no provided method is chosen, it is assumed that the user wishes to use a custom panel
@@ -144,35 +81,10 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
 
         # Load panel from path provided
         panel <- readRDS(customPath)
-
-        # Select genes that are shared by the input data and the panel
-        shared_genes <- intersect(rownames(sc_data), rownames(panel))
-
-        # Reduce the panel and input data to the shared genes
-        subset_panel = panel[shared_genes, ]
-        subset_data = sc_data[shared_genes, , drop = FALSE]
-
-        # Compute projection of input data with the panel
-        if(corMeth == "pearson") {
-            subset_panel = as.matrix(subset_panel)
-            projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
-        } else {
-            projection <- cor(subset_panel, subset_data, method = corMeth)
-        }
-        rownames(projection) <- colnames(subset_panel)
-        colnames(projection) <- colnames(subset_data)
-
-        # Raise the projection to power
-        projection = abs(projection) ^ (power) * sign(projection)
-
-        # If scaling is required
-        if (scale) {
-
-            # Scale
-            projection = scale(projection,
-                               center = TRUE,
-                               scale = TRUE)
-        }
+        
+        # Store projection data
+        projection = panelProjection(sc_data, panel, corMeth=corMeth,
+                                    power=power, scale=scale)
     }
 
     # Store projection result as Matrix

From 04274217da24a58b109162fd0914471f9b366514 Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 11 Feb 2020 14:51:20 +0800
Subject: [PATCH 03/10] Finished to implement ColonEpitheliumPanel in
 dataProject

---
 R/dataProject.R | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/R/dataProject.R b/R/dataProject.R
index 4304cbd..1dc193b 100644
--- a/R/dataProject.R
+++ b/R/dataProject.R
@@ -1,5 +1,5 @@
 source("panelProjection.R")
-
+source("~/Work/RCAv2/R/panelProjection.R")
 #' Compute Reference Component features for clustering analysis
 #'
 #' @param rca.obj RCA object.
@@ -52,14 +52,16 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
         # Load reference panel data from environment
         data(ReferencePanel, envir = environment())
         panel = ReferencePanel$ColonEpiPanel
-        gene.names = as.character(str_extract_all(rownames(panel), "_[[:alnum:]]+_"))
-        rownames(panel) = str_sub(gene.names, 2, -2)
         # Scale panel by median
         fc = apply(panel, 1, function(x) x - median(x))
         fs = fc > 1.5
-        panel + panel[apply(fs, 1, function(x)
+        panel = panel[apply(fs, 1, function(x)
             sum(x)) > 0,]
-        
+        # Convert rownames to gene symbol and remove duplicates
+        gene.names = as.character(str_extract_all(rownames(panel), "_.+_"))
+        gene.names = str_sub(gene.names, 2, -2)
+        panel = panel[-which(duplicated(gene.names)),]
+        rownames(panel) = gene.names[-which(duplicated(gene.names))]
         # Store projection data
         projection= panelProjection(sc_data, panel, corMeth=corMeth,
                                     power=power, scale=scale)

From 844fb22ea9aec0e27ea708e4d8a0b87f05b86c03 Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Thu, 13 Feb 2020 10:13:54 +0800
Subject: [PATCH 04/10] Pre-processing of ColonEpitheliumPanel now combines
 duplicates instead of removing them.

---
 R/dataProject.R | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/R/dataProject.R b/R/dataProject.R
index 1dc193b..43e76dd 100644
--- a/R/dataProject.R
+++ b/R/dataProject.R
@@ -52,16 +52,26 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
         # Load reference panel data from environment
         data(ReferencePanel, envir = environment())
         panel = ReferencePanel$ColonEpiPanel
+        # Convert rownames to gene symbol and sum duplicates
+        gene.names = as.character(str_extract_all(rownames(panel), "_.+_"))
+        gene.names = str_sub(gene.names, 2, -2)
+        panel = cbind(panel, gene.names)
+        dup.genes = unique(gene.names[duplicated(gene.names)])
+        for (gene in dup.genes) {
+            sub.panel = panel[which(gene.names == gene),1:(ncol(panel) - 1)]
+            new.row = apply(sub.panel, MARGIN = 2, FUN = "sum")
+            for (i in which(gene.names == gene)) {
+                panel[i, 1:(ncol(panel) - 1)] = new.row
+            }
+        }
+        panel = panel[-which(duplicated(panel$gene.names)),]
+        rownames(panel) = panel$gene.names
+        panel = panel[,-ncol(panel)]
         # Scale panel by median
         fc = apply(panel, 1, function(x) x - median(x))
         fs = fc > 1.5
         panel = panel[apply(fs, 1, function(x)
             sum(x)) > 0,]
-        # Convert rownames to gene symbol and remove duplicates
-        gene.names = as.character(str_extract_all(rownames(panel), "_.+_"))
-        gene.names = str_sub(gene.names, 2, -2)
-        panel = panel[-which(duplicated(gene.names)),]
-        rownames(panel) = gene.names[-which(duplicated(gene.names))]
         # Store projection data
         projection= panelProjection(sc_data, panel, corMeth=corMeth,
                                     power=power, scale=scale)
@@ -100,3 +110,4 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
 
     return(rca.obj)
 }
+        
\ No newline at end of file

From 7a9ed0bf4229a760f332d437cc6fbbd2b23c521f Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Mon, 17 Feb 2020 14:47:19 +0800
Subject: [PATCH 05/10] Added ctQueryReg a function to select cell-type (for
 panel)  with a query using regular expression.

---
 R/ctQueryReg.R | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 R/ctQueryReg.R

diff --git a/R/ctQueryReg.R b/R/ctQueryReg.R
new file mode 100644
index 0000000..91003a7
--- /dev/null
+++ b/R/ctQueryReg.R
@@ -0,0 +1,63 @@
+#' Function to parse from a character vector all character matching all keywords from a query.
+#' 
+#' @param query character or character vector.
+#' @param db a character vector to parse query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with corresonding full character of db as first column.
+#' @param remakedb a boolean value, default is TRUE. Put to FALSE (you should not do this) if you can provide a matrix as db as describe in db parameter.  
+#' @return a character vector with all matches for one query.
+#' @details 
+#' This function will look for matches between a query and a character database using regular expression for text mining.
+#' This function is not perfect and require some good practices:
+#' - Take look at your database to design your query.
+#' - Include both litteral and acronym form (if it exist) of a term in distinct query character.
+#' - Avoid using plural forms in query.
+#' - Correclty spell your query.
+#' - Database will not always be clean, and concatenated words (for instance: forinstance)
+#' will not be detected but with a specific corresponding concatened query.
+#' @export
+#' 
+ctQueryReg = function(query, db ,remakedb=TRUE) {
+    # Parse words in db if needed
+    if (remakedb) {
+        new_db = matrix(ncol = 2)
+        for (i in db) {
+            ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i)
+            words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE))
+            one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words))
+            new_db = rbind(new_db, one_ct)
+        }
+        colnames(new_db) = c("book", "word")
+        new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),])
+    } else {
+        new_db = db
+    }
+    # If user provides several queries for one db, apply this function for each query
+    if (length(query) > 1) {
+        db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ct_query_reg", db=db, remakedb=FALSE)
+        db.match = unlist(db.match.list)
+        if (length(which(duplicated(db.match))) > 0) {
+            db.match = db.match[-which(duplicated(db.match))]
+        }
+    } else {
+        # Parse words in query
+        query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
+        query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
+        query.words = query.words[query.words != ""]
+        db.match = c()
+        # For each words in query, detect matching characters in db and 
+        # Keep intersect with mactches of other words
+        for (i in 1:length(query.words)) {
+            query.reg = paste0("^", query.words[i], "s?$")
+            db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1]
+            if (i == 1) {
+                db.match = db.match.word
+            } else {
+                db.match = intersect(db.match, db.match.word)
+            }
+        }
+        # Try to find matches for a concatened form of the full query
+        query.reg = paste0(c("^", query.words, "S?$"), collapse = "")
+        db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1]
+        db.match = union(db.match, db.match.word)
+    } 
+    return(db.match)
+}

From bd5cfb055f283f65ba1c2a15165f59122dbc9afe Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 18 Feb 2020 10:13:43 +0800
Subject: [PATCH 06/10] Corrected function's name in apply line

---
 R/ctQueryReg.R | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/R/ctQueryReg.R b/R/ctQueryReg.R
index 91003a7..564ddbd 100644
--- a/R/ctQueryReg.R
+++ b/R/ctQueryReg.R
@@ -15,7 +15,7 @@
 #' will not be detected but with a specific corresponding concatened query.
 #' @export
 #' 
-ctQueryReg = function(query, db ,remakedb=TRUE) {
+ctQueryReg = function(query, db, remakedb=TRUE) {
     # Parse words in db if needed
     if (remakedb) {
         new_db = matrix(ncol = 2)
@@ -32,7 +32,7 @@ ctQueryReg = function(query, db ,remakedb=TRUE) {
     }
     # If user provides several queries for one db, apply this function for each query
     if (length(query) > 1) {
-        db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ct_query_reg", db=db, remakedb=FALSE)
+        db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryReg", db=new_db, remakedb=FALSE)
         db.match = unlist(db.match.list)
         if (length(which(duplicated(db.match))) > 0) {
             db.match = db.match[-which(duplicated(db.match))]
@@ -58,6 +58,9 @@ ctQueryReg = function(query, db ,remakedb=TRUE) {
         query.reg = paste0(c("^", query.words, "S?$"), collapse = "")
         db.match.word = new_db[str_detect(new_db[,2], regex(query.reg, ignore_case = TRUE)), 1]
         db.match = union(db.match, db.match.word)
+        if (length(db.match) == 0) {
+            print(paste0("No match for query: ", query))
+        } 
     } 
     return(db.match)
 }

From 1f6d57c74983235ae12f1a621d57ab6b884b0444 Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 18 Feb 2020 10:20:50 +0800
Subject: [PATCH 07/10] Added cell type selection to panelProjection

---
 R/panelProjection.R | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/R/panelProjection.R b/R/panelProjection.R
index ea674ae..820ab88 100644
--- a/R/panelProjection.R
+++ b/R/panelProjection.R
@@ -1,6 +1,8 @@
+source("ctQueryReg.R")
+
 #' Compute Reference Component features for clustering analysis
 #'
-#' @param rca.obj RCA object.
+#' @param sc_data data field from an RCA object.
 #' @param panel Reference panel as a matrix
 #' @param corMeth Any of the correlation measures supported by R, defaults to pearson
 #' @param power power to raise up to for the RCA features before clustering, default is 4
@@ -11,21 +13,42 @@
 #' @export
 #'
 
-panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRUE, apply_threshold=FALSE, threshold=NULL) {
+panelProjection = function(sc_data,
+                           panel,
+                           corMeth="pearson",
+                           power=4,
+                           scale=TRUE,
+                           apply_threshold=FALSE,
+                           threshold=NULL,
+                           ctSelection = NULL) {
+    
+    # Apply selection of cell types
+    if (!is.null(ctSelection)) {
+        selected.ct = ctQueryReg(ctSelection, colnames(panel))
+        if (length(selected.ct) != 0) {
+            genes.names = rownames(panel)
+            panel = as.matrix(panel[,selected.ct])
+            rownames(panel) = genes.names
+            colnames(panel) = selected.ct
+        } else {
+            print("No match found for this ctSelection, no projection was done.")
+            return(NULL)
+        }
+    }
     
     # Select genes that are shared by the input data and the panel
     shared_genes <- intersect(rownames(sc_data), rownames(panel))
     print(paste0("Projection on ", length(shared_genes), " genes."))
     
     # Reduce the panel and input data to the shared genes
-    subset_panel = panel[shared_genes, ]
+    subset_panel = as.matrix(panel[shared_genes, ])
+    colnames(subset_panel) = colnames(panel)
     subset_data = sc_data[shared_genes, , drop = FALSE]
     
     # For values in the panel below the minimum threshold, set those values to threshold
     if (apply_threshold) {
         subset_panel[subset_panel <= threshold] = threshold
     }
-    
     # Compute projection of input data with the panel
     if(corMeth == "pearson") {
         subset_panel = as.matrix(subset_panel)
@@ -46,6 +69,5 @@ panelProjection = function(sc_data, panel, corMeth="pearson", power=4, scale=TRU
                            center = TRUE,
                            scale = TRUE)
     }
-    
     return(projection)
-}
\ No newline at end of file
+}

From 5370dd1466415d43bda4c4d6263959fdc0f536bf Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 18 Feb 2020 10:24:21 +0800
Subject: [PATCH 08/10] Added cell type selection to dataProject and handling
 of failed projection due to cell type selection

---
 R/dataProject.R | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/R/dataProject.R b/R/dataProject.R
index 43e76dd..f293e01 100644
--- a/R/dataProject.R
+++ b/R/dataProject.R
@@ -1,5 +1,4 @@
 source("panelProjection.R")
-source("~/Work/RCAv2/R/panelProjection.R")
 #' Compute Reference Component features for clustering analysis
 #'
 #' @param rca.obj RCA object.
@@ -8,10 +7,17 @@ source("~/Work/RCAv2/R/panelProjection.R")
 #' @param corMeth Any of the correlation measures supported by R, defaults to pearson
 #' @param power power to raise up to for the RCA features before clustering, default is 4
 #' @param scale True if the data should be scaled, False otherwise
+#' @param ctSelection A character vector of cell types if not NULL. Only selected cell types will be kept in projection. 
 #' @return RCA object.
 #' @export
 #'
-dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corMeth = "pearson", power = 4, scale = T) {
+dataProject <- function(rca.obj,
+                        method = "GlobalPanel",
+                        customPath = NULL,
+                        corMeth = "pearson",
+                        power = 4,
+                        scale = T,
+                        ctSelection = NULL) {
 
     # Extract data
     sc_data <- rca.obj$data
@@ -39,7 +45,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
                                                    corMeth=corMeth,
                                                    power=power, scale=scale, 
                                                    apply_threshold=TRUE, 
-                                                   threshold=(ReferencePanel$at)[i])
+                                                   threshold=(ReferencePanel$at)[i],
+                                                   ctSelection = ctSelection)
         }
 
         # Combine the projection result of multiple Global Panel fragments
@@ -74,7 +81,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
             sum(x)) > 0,]
         # Store projection data
         projection= panelProjection(sc_data, panel, corMeth=corMeth,
-                                    power=power, scale=scale)
+                                    power=power, scale=scale,
+                                    ctSelection = ctSelection)
     }
     
     # If panel for correlation is ENCODEPanel
@@ -85,7 +93,8 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
         
         # Store projection data
         projection= panelProjection(sc_data, ENCODEPanel, corMeth=corMeth,
-                                    power=power, scale=scale)
+                                    power=power, scale=scale,
+                                    ctSelection = ctSelection)
     }
     
     # If no provided method is chosen, it is assumed that the user wishes to use a custom panel
@@ -96,9 +105,14 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
         
         # Store projection data
         projection = panelProjection(sc_data, panel, corMeth=corMeth,
-                                    power=power, scale=scale)
+                                    power=power, scale=scale,
+                                    ctSelection = ctSelection)
+    }
+    
+    if (is.null(projection)) {
+        print("Not any projection was succesfull for this panel.")
+        return(NULL)
     }
-
     # Store projection result as Matrix
     projection = as.matrix(projection)
     projection = as(projection, "dgCMatrix")

From 7e2e6de3a06ec47b996cff693ccb101b19e7e86e Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 18 Feb 2020 10:25:39 +0800
Subject: [PATCH 09/10] added multiRCAPanel.R, a function for combination of
 projection data.

---
 R/multiRCAPanel.R | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 R/multiRCAPanel.R

diff --git a/R/multiRCAPanel.R b/R/multiRCAPanel.R
new file mode 100644
index 0000000..f16981a
--- /dev/null
+++ b/R/multiRCAPanel.R
@@ -0,0 +1,38 @@
+#' Function to combine base reference panels for one projection.
+#' 
+#' @param rca.obj RCA object. 
+#' @param panel.name Vector of methods from dataProject to select panels ("GlobalPanel", "ColonEpitheliumPanel", "ENCODEPanel" and "Custom").
+#' @param customPath Character vector with corresponding customPath to "Custom" panel name for dataProject.
+#' @param corMeth Character vector with corresponding customPath to "Custom" panel name for dataProject.
+#' @param power Numeric vector for corresponding power parameter to pass to dataProject for each method in panel.name.
+#' @param scale Boolean vector for corresponding scale parameter to pass to dataProject for each method in panel.name.
+#' @param ctSelection List of character vector for corresponding ctSelect parameter to pass to dataProject for each method in panel.name.
+#'
+#' @export
+#' 
+multiRCAPanel = function(rca.obj, 
+                         panel.name=c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel"),
+                         customPath = rep(NULL, length(panel.name)),
+                         corMeth = rep("pearson", length(panel.name)),
+                         power = rep(4, length(panel.name)),
+                         scale = rep(TRUE, length(panel.name)),
+                         ctSelection = rep(NULL, length(panel.name))) {
+    
+    full.projection = list()
+    for (i in 1:length(panel.name)) {
+        panel = panel.name[i]
+        print(panel)
+        if (panel %in% c("GlobalPanel", "ENCODEPanel", "ColonEpitheliumPanel", "Custom")) {
+            dataProject(rca.obj = rca.obj, method = panel,
+                               corMeth = corMeth[i], power = power[i],
+                               scale = scale[i], ctSelection = ctSelection[[i]])
+            full.projection[[panel]] = rca.obj$projection.data
+        } else {
+            print("panel.name must be methods from dataProject().")
+        }
+    }
+    
+    rca.obj$projection.data = do.call("rbind", full.projection)
+    return(rca.obj)
+}
+

From f7db697752d6523b759e2b1928feba7cd48b3850 Mon Sep 17 00:00:00 2001
From: leprohonmalo <leprohonmalo@gmail.com>
Date: Tue, 18 Feb 2020 14:49:09 +0800
Subject: [PATCH 10/10] Added a function to select cell types based on a tf-idf
 score, but did not implement this function in other functions yet.

---
 R/ctQueryTFIDF.R | 100 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)
 create mode 100644 R/ctQueryTFIDF.R

diff --git a/R/ctQueryTFIDF.R b/R/ctQueryTFIDF.R
new file mode 100644
index 0000000..592c37e
--- /dev/null
+++ b/R/ctQueryTFIDF.R
@@ -0,0 +1,100 @@
+#' Function to parse from a character vector all character matching keywords from a query using TF-IDF metric.
+#' 
+#' @param query character or character vector.
+#' @param db a character vector to parse query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with corresonding full character of db as first column.
+#' @param remakedb a boolean value, default is TRUE. Put to FALSE (you should not do this) if you can provide a matrix as db as describe in db parameter.  
+#' @return a character vector with all matches for one query.
+#' @details 
+#' This function will look for matches between a query and a character database using TF-IDF for text mining.
+#' This function is not perfect and require some good practices:
+#' - Take look at your database to design your query.
+#' - Include both litteral and acronym form (if it exist) of a term in distinct query character.
+#' - Avoid using plural forms in query.
+#' - Correclty spell your query.
+#' - Database will not always be clean, and concatenated words (for instance: forinstance)
+#' will not be detected but with a specific corresponding concatened query.
+#' @export
+#' 
+ctQueryTFIDF = function(query, db, threshold=1, remakedb=TRUE) {
+    
+    # Parse words in db if needed
+    if (remakedb) {
+        new_db = matrix(ncol = 2)
+        for (i in db) {
+            ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i)
+            words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE))
+            one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words))
+            new_db = rbind(new_db, one_ct)
+        }
+        colnames(new_db) = c("book", "word")
+        new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),])
+        
+        # Homogenize word spelling:
+        for (wd in unique(new_db[,2])) {
+            word.reg = paste0("^", wd, "s?$")
+            db.match.word = str_detect(new_db[,2], regex(word.reg, ignore_case = TRUE))
+            # May need optimazation
+            new_db[which(db.match.word),2] = wd
+        }
+        
+        # Compute word count and tf_idf
+        new_db = new_db %>% count(book, word)
+        total_words = new_db %>% 
+            group_by(book) %>% 
+            summarize(total = sum(n))
+        new_db = left_join(new_db, total_words)
+        new_db = new_db %>%
+            bind_tf_idf(word, book, n)    
+        
+        # Create books x words tf_idf matrix
+        db.words = unique(new_db$word)
+        db.names = unique(new_db$book)
+        tfidf.table = matrix(0L, nrow = length(db.words), ncol = length(db.names))
+        colnames(tfidf.table) = sort(db.names)
+        rownames(tfidf.table) = sort(db.words)
+        for (i in 1:nrow(new_db)) {
+            tfidf.table[as.vector(new_db$word[i]), as.vector(new_db$book[i])] = new_db$tf_idf[i]
+        }
+        
+    } else {
+        tfidf.table = db
+    }
+    # If user provides several queries for one db, apply this function for each query
+    if (length(query) > 1) {
+        db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryTFIDF",
+                              db=tfidf.table, threshold=threshold, remakedb=FALSE)
+        db.match = unlist(db.match.list)
+        if (length(which(duplicated(db.match))) > 0) {
+            db.match = db.match[-which(duplicated(db.match))]
+        }
+    } else {
+        
+        # Parse words in query and find match in db for each words
+        query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
+        query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
+        query.words = query.words[query.words != ""]
+        query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
+        colnames(query.occ) = rownames(tfidf.table)
+        for (i in query.words) {
+            word.reg = paste0("^", i, "s?$")
+            query.occ[1,str_detect(colnames(query.occ), regex(word.reg, ignore_case = TRUE))] = 1
+        }
+        
+        # Compute cell type's score based on words match and words tfidf 
+        ct.score = query.occ %*% tfidf.table
+        db.match = colnames(ct.score)[ct.score[1,] >= threshold]
+        
+        # Try to find matches for a concatened form of the full query
+        query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
+        colnames(query.occ) = rownames(tfidf.table)
+        query.reg = paste0(c("^", query.words, "s?$"), collapse = "")
+        query.occ[1,str_detect(colnames(query.occ), regex(query.reg, ignore_case = TRUE))] = 1
+        ct.score = query.occ %*% tfidf.table
+        db.match.word = colnames(ct.score)[ct.score[1,] > 0.5]
+        db.match = union(db.match, db.match.word)
+        if (length(db.match) == 0) {
+            print(paste0("No match for query: ", query))
+        } 
+    } 
+    return(db.match)
+}