Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions R/ctQueryReg.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#' Function to parse from a character vector all characters matching all keywords from a query.
#'
#' @param query character or character vector.
#' @param db a character vector to parse the query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with the corresponding full character of db as first column.
#' @param remakedb a boolean value, default is TRUE. Set to FALSE (you should not do this) only if you can provide a matrix as db, as described in the db parameter.
#' @return a character vector with all matches for one query.
#' @details
#' This function will look for matches between a query and a character database using regular expressions for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both the literal and acronym form (if it exists) of a term as distinct query characters.
#' - Avoid using plural forms in the query.
#' - Correctly spell your query.
#' - The database will not always be clean, and concatenated words (for instance: forinstance)
#' will only be detected by a specific corresponding concatenated query.
#' @export
#'
ctQueryReg = function(query, db, remakedb=TRUE) {
  # Build the word lookup table if needed: one row per (entry, word) pair,
  # column 1 ("book") = original db entry, column 2 ("word") = one word of it.
  if (remakedb) {
    rows = lapply(db, function(entry) {
      # Insert a space before every Upper+lower pair so camelCase entries split.
      spaced = gsub('([[:upper:]][[:lower:]])', ' \\1', entry)
      # Split on runs of punctuation and spaces.
      words = as.vector(str_split(spaced, regex("[[:punct:] ]+"), simplify = TRUE))
      cbind(rep(entry, length(words)), words)
    })
    # Bind once instead of growing the matrix inside the loop (avoids the
    # placeholder NA row of matrix(ncol = 2) and the O(n^2) rbind pattern).
    new_db = do.call(rbind, rows)
    colnames(new_db) = c("book", "word")
    # Drop empty-string words produced by leading/trailing separators.
    # drop = FALSE keeps a matrix even when a single row remains.
    new_db = as.data.frame(new_db[new_db[, 2] != "", , drop = FALSE])
  } else {
    new_db = db
  }

  # Several queries on one db: recurse per query on the already-parsed table,
  # then de-duplicate the union of the results.
  if (length(query) > 1) {
    per_query = lapply(query, ctQueryReg, db = new_db, remakedb = FALSE)
    db.match = unique(unlist(per_query))
  } else {
    # Parse the query into words exactly like the db entries.
    query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
    query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
    query.words = query.words[query.words != ""]
    db.match = c()
    # For each query word, collect db entries containing that word (with an
    # optional plural "s"); keep only entries matching ALL query words.
    for (i in seq_along(query.words)) {
      query.reg = paste0("^", query.words[i], "s?$")
      db.match.word = new_db[str_detect(new_db[, 2], regex(query.reg, ignore_case = TRUE)), 1]
      if (i == 1) {
        db.match = db.match.word
      } else {
        db.match = intersect(db.match, db.match.word)
      }
    }
    # Also try a concatenated form of the full query (e.g. "forinstance").
    # Lowercase "s?" for consistency with the per-word pattern above
    # (matching is case-insensitive either way).
    query.reg = paste0(c("^", query.words, "s?$"), collapse = "")
    db.match.word = new_db[str_detect(new_db[, 2], regex(query.reg, ignore_case = TRUE)), 1]
    db.match = union(db.match, db.match.word)
    if (length(db.match) == 0) {
      print(paste0("No match for query: ", query))
    }
  }
  return(db.match)
}
100 changes: 100 additions & 0 deletions R/ctQueryTFIDF.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#' Function to parse from a character vector all characters matching keywords from a query using the TF-IDF metric.
#'
#' @param query character or character vector.
#' @param db a character vector to parse the query from. If remakedb=FALSE it should be a matrix with each extracted word of db as second column and with the corresponding full character of db as first column.
#' @param threshold numeric, default is 1. Minimal summed TF-IDF score a database entry must reach to be reported as a match.
#' @param remakedb a boolean value, default is TRUE. Set to FALSE (you should not do this) only if you can provide a matrix as db, as described in the db parameter.
#' @return a character vector with all matches for one query.
#' @details
#' This function will look for matches between a query and a character database using TF-IDF for text mining.
#' This function is not perfect and requires some good practices:
#' - Take a look at your database to design your query.
#' - Include both the literal and acronym form (if it exists) of a term as distinct query characters.
#' - Avoid using plural forms in the query.
#' - Correctly spell your query.
#' - The database will not always be clean, and concatenated words (for instance: forinstance)
#' will only be detected by a specific corresponding concatenated query.
#' @export
#'
ctQueryTFIDF = function(query, db, threshold=1, remakedb=TRUE) {

  # Parse words in db if needed: build one row per (entry, word) pair, with
  # the full original entry as "book" and each extracted word as "word".
  if (remakedb) {
    new_db = matrix(ncol = 2)
    for (i in db) {
      # Insert a space before every Upper+lower pair so camelCase entries split.
      ibis = gsub('([[:upper:]][[:lower:]])', ' \\1', i)
      # Split on runs of punctuation and spaces (stringr).
      words = as.vector(str_split(ibis, regex("[[:punct:] ]+"), simplify = TRUE))
      one_ct = matrix(data = cbind(rep(i,length(words)), words), ncol=2, nrow = length(words))
      new_db = rbind(new_db, one_ct)
    }
    colnames(new_db) = c("book", "word")
    # Drop the placeholder NA row from matrix(ncol = 2) plus empty-string words.
    new_db = as.data.frame(new_db[-c(1,which(new_db[,2] == "")),])

    # Homogenize word spelling: collapse case variants and plural "s" forms
    # onto a single representative so they count as the same term below.
    # NOTE(review): this mutates the same column it scans; the representative
    # chosen depends on the order of unique() — presumably acceptable here.
    for (wd in unique(new_db[,2])) {
      word.reg = paste0("^", wd, "s?$")
      db.match.word = str_detect(new_db[,2], regex(word.reg, ignore_case = TRUE))
      # May need optimization (one full regex scan per unique word).
      new_db[which(db.match.word),2] = wd
    }

    # Compute word count and tf_idf (dplyr count/group_by + tidytext
    # bind_tf_idf). left_join has no explicit `by`; it joins on the shared
    # "book" column — TODO confirm and consider making it explicit.
    new_db = new_db %>% count(book, word)
    total_words = new_db %>%
      group_by(book) %>%
      summarize(total = sum(n))
    new_db = left_join(new_db, total_words)
    new_db = new_db %>%
      bind_tf_idf(word, book, n)

    # Create the words x books tf_idf matrix (rows = words, cols = books),
    # zero-initialised and filled by name so the sorted dimnames are safe.
    db.words = unique(new_db$word)
    db.names = unique(new_db$book)
    tfidf.table = matrix(0L, nrow = length(db.words), ncol = length(db.names))
    colnames(tfidf.table) = sort(db.names)
    rownames(tfidf.table) = sort(db.words)
    for (i in 1:nrow(new_db)) {
      tfidf.table[as.vector(new_db$word[i]), as.vector(new_db$book[i])] = new_db$tf_idf[i]
    }

  } else {
    # Caller supplied a pre-built words x books tf-idf matrix.
    tfidf.table = db
  }
  # If user provides several queries for one db, apply this function for each
  # query on the already-built matrix, then drop duplicate matches.
  if (length(query) > 1) {
    db.match.list = apply(as.array(query), MARGIN = 1, FUN = "ctQueryTFIDF",
                          db=tfidf.table, threshold=threshold, remakedb=FALSE)
    db.match = unlist(db.match.list)
    if (length(which(duplicated(db.match))) > 0) {
      db.match = db.match[-which(duplicated(db.match))]
    }
  } else {

    # Parse words in query (same camelCase/punctuation split as the db) and
    # build a 1 x words indicator row: 1 where a db word matches a query word
    # (optional plural "s", case-insensitive), 0 elsewhere.
    query.words = gsub('([[:upper:]][[:lower:]])', ' \\1', query)
    query.words = as.vector(str_split(query.words, regex("[[:punct:] ]+"), simplify = TRUE))
    query.words = query.words[query.words != ""]
    query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
    colnames(query.occ) = rownames(tfidf.table)
    for (i in query.words) {
      word.reg = paste0("^", i, "s?$")
      query.occ[1,str_detect(colnames(query.occ), regex(word.reg, ignore_case = TRUE))] = 1
    }

    # Compute each book's score as the sum of tf-idf weights of the matched
    # words (1 x words %*% words x books); keep books reaching the threshold.
    ct.score = query.occ %*% tfidf.table
    db.match = colnames(ct.score)[ct.score[1,] >= threshold]

    # Try to find matches for a concatenated form of the full query
    # (e.g. "forinstance"), scored the same way but with a fixed 0.5 cutoff.
    # NOTE(review): 0.5 is hard-coded rather than using `threshold` — confirm
    # this is intentional.
    query.occ = matrix(rep(0, nrow(tfidf.table)), ncol = nrow(tfidf.table), nrow = 1)
    colnames(query.occ) = rownames(tfidf.table)
    query.reg = paste0(c("^", query.words, "s?$"), collapse = "")
    query.occ[1,str_detect(colnames(query.occ), regex(query.reg, ignore_case = TRUE))] = 1
    ct.score = query.occ %*% tfidf.table
    db.match.word = colnames(ct.score)[ct.score[1,] > 0.5]
    db.match = union(db.match, db.match.word)
    if (length(db.match) == 0) {
      print(paste0("No match for query: ", query))
    }
  }
  return(db.match)
}
170 changes: 56 additions & 114 deletions R/dataProject.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
source("panelProjection.R")
#' Compute Reference Component features for clustering analysis
#'
#' @param rca.obj RCA object.
Expand All @@ -6,10 +7,17 @@
#' @param corMeth Any of the correlation measures supported by R, defaults to pearson
#' @param power power to raise up to for the RCA features before clustering, default is 4
#' @param scale True if the data should be scaled, False otherwise
#' @param ctSelection A character vector of cell types if not NULL. Only selected cell types will be kept in projection.
#' @return RCA object.
#' @export
#'
dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corMeth = "pearson", power = 4, scale = T) {
dataProject <- function(rca.obj,
method = "GlobalPanel",
customPath = NULL,
corMeth = "pearson",
power = 4,
scale = T,
ctSelection = NULL) {

# Extract data
sc_data <- rca.obj$data
Expand All @@ -22,45 +30,15 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM

# For each fragment of the Global Panel
for (i in 1:length(ReferencePanel[[1]])) {

# Initialise panel
panel = ReferencePanel[[1]][[i]]

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# For values in the panel below the minimum threshold, set those values to threshold
subset_panel[subset_panel <= (ReferencePanel$at)[i]] = (ReferencePanel$at)[i]

# Compute projection of input data with the panel fragment
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection_fragment <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection_fragment <- cor(subset_panel, subset_data, method = corMeth)
}


# Reattach dimnames
colnames(projection_fragment) <- colnames(subset_data)
rownames(projection_fragment) <- colnames(subset_panel)

# Raise the projection fragment to power
projection_fragment = abs(projection_fragment) ^ (power) * sign(projection_fragment)

# If scaling is required
if (scale) {

# Scale
projection_fragment = scale(projection_fragment, center = TRUE, scale = TRUE)
}


# Store projection data of fragment of Global Panel
projection_list[[i]] = projection_fragment
projection_list[[i]] = panelProjection(sc_data,
ReferencePanel[[1]][[i]],
corMeth=corMeth,
power=power, scale=scale,
apply_threshold=TRUE,
threshold=(ReferencePanel$at)[i],
ctSelection = ctSelection)
}

# Combine the projection result of multiple Global Panel fragments
Expand All @@ -70,97 +48,60 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM
# If panel for correlation is ColonEpitheliumPanel
else if (method == "ColonEpitheliumPanel") {

panel = ReferencePanel$ColonEpiPanel
# Convert rownames to gene symbol and sum duplicates
gene.names = as.character(str_extract_all(rownames(panel), "_.+_"))
gene.names = str_sub(gene.names, 2, -2)
panel = cbind(panel, gene.names)
dup.genes = unique(gene.names[duplicated(gene.names)])
for (gene in dup.genes) {
sub.panel = panel[which(gene.names == gene),1:(ncol(panel) - 1)]
new.row = apply(sub.panel, MARGIN = 2, FUN = "sum")
for (i in which(gene.names == gene)) {
panel[i, 1:(ncol(panel) - 1)] = new.row
}
}
panel = panel[-which(duplicated(panel$gene.names)),]
rownames(panel) = panel$gene.names
panel = panel[,-ncol(panel)]
# Scale panel by median
fc = apply(ReferencePanel$ColonEpiPanel, 1, function(x) x - median(x))

fc = apply(panel, 1, function(x) x - median(x))
fs = fc > 1.5

fs1 = rownames(ReferencePanel$ColonEpiPanel[apply(fs, 1, function(x)
sum(x)) > 0,])
gl_intersect = intersect(rownames(fpkm_temp), fs1)
projection = as.data.frame(cor(fpkm_temp[gl_intersect,], ReferencePanel$ColonEpiPanel[gl_intersect,], corMeth))
projection = abs(projection) ^ (power) * sign(projection)
if (scale) {
projection = scale(projection,
center = TRUE,
scale = TRUE)
}
panel = panel[apply(fs, 1, function(x)
sum(x)) > 0,]
# Store projection data
projection= panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

# If any other panel is chosen
else if (method %in% names(ReferencePanel)) {

panel <- ReferencePanel[[method]]

# Initialise variable to store projection data from the two fragments of the Global Panel
projection_list = list()

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# Compute projection of input data with the panel
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection <- cor(subset_panel, subset_data, method = corMeth)
}
rownames(projection) <- colnames(subset_panel)
colnames(projection) <- colnames(subset_data)

# Raise the projection to power
projection = abs(projection) ^ (power) * sign(projection)

# If scaling is required
if (scale) {

# Scale
projection = scale(projection,
center = TRUE,
scale = TRUE)
}

# Store projection data
projection= panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

# If no provided method is chosen, it is assumed that the user wishes to use a custom panel
else {

# Load panel from path provided
panel <- readRDS(customPath)

# Select genes that are shared by the input data and the panel
shared_genes <- intersect(rownames(sc_data), rownames(panel))

# Reduce the panel and input data to the shared genes
subset_panel = panel[shared_genes, ]
subset_data = sc_data[shared_genes, , drop = FALSE]

# Compute projection of input data with the panel
if(corMeth == "pearson") {
subset_panel = as.matrix(subset_panel)
projection <- qlcMatrix::corSparse(X = subset_panel, Y = subset_data)
} else {
projection <- cor(subset_panel, subset_data, method = corMeth)
}
rownames(projection) <- colnames(subset_panel)
colnames(projection) <- colnames(subset_data)

# Raise the projection to power
projection = abs(projection) ^ (power) * sign(projection)

# If scaling is required
if (scale) {

# Scale
projection = scale(projection,
center = TRUE,
scale = TRUE)
}

# Store projection data
projection = panelProjection(sc_data, panel, corMeth=corMeth,
power=power, scale=scale,
ctSelection = ctSelection)
}

if (is.null(projection)) {
print("Not any projection was succesfull for this panel.")
return(NULL)
}

# Store projection result as Matrix
projection = as.matrix(projection)
projection = as(projection, "dgCMatrix")
Expand All @@ -172,3 +113,4 @@ dataProject <- function(rca.obj, method = "GlobalPanel", customPath = NULL, corM

return(rca.obj)
}

Loading