diff --git a/NAMESPACE b/NAMESPACE index e77ecf7..9a01d89 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,10 +3,13 @@ export(colocboost) export(colocboost_assemble) export(colocboost_check_update_jk) -export(colocboost_get_methods) export(colocboost_inits) +export(colocboost_plot) +export(colocboost_post_inference) export(colocboost_update) export(colocboost_workhorse) +export(get_cos_summary) +export(get_strong_colocalization) importFrom(grDevices,adjustcolor) importFrom(graphics,abline) importFrom(graphics,axis) diff --git a/R/colocboost.R b/R/colocboost.R index 1bb1245..c53bf83 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -31,39 +31,42 @@ #' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. #' The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden. #' @param outcome_names The names of outcomes, which has the same order for Y. -#' @param target_idx The index of the target outcome if perform targeted ColocBoost -#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region -#' @param effect_se Matrix of standard errors associated with the beta values -#' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommendated! -#' @param target_variables If \code{target_variables = TRUE}, only consider the variables exsit in the target outcome. +#' @param target_outcome_idx The index of the target outcome if perform GWAS-xQTL ColocBoost +#' @param target_outcome_variables If \code{target_outcome_variables = TRUE}, only consider the variables exist in the target outcome. #' @param overlap_variables If \code{overlap_variables = TRUE}, only perform colocalization in the overlapped region. #' @param intercept If \code{intercept = TRUE}, the intercept is fitted. Setting \code{intercept = FALSE} is generally not recommended. 
#' @param standardize If \code{standardize = TRUE}, standardize the columns of genotype and outcomes to unit variance. +#' @param effect_est Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region +#' @param effect_se Matrix of standard errors associated with the beta values +#' @param effect_n A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended! #' #' @section Model Parameters -#' @param M The maximum number of gradient boosting iterations. If the number of outcomes are large, it will be automatically increased to a larger number. +#' @param M The maximum number of gradient boosting rounds. If the number of outcomes are large, it will be automatically increased to a larger number. #' @param stop_thresh The stop criterion for overall profile loglikelihood function. -#' @param step The minimum step size (learning rate) for updating in each iteration. -#' @param decayrate The decayrate for step size. If the objective function is large at the early iterations, -#' we need to have the higher step size to improve the computational efficiency. #' @param tau The smooth parameter for proximity adaptive smoothing weights for the best update jk-star. +#' @param learning_rate_init The minimum learning rate for updating in each iteration. +#' @param learning_rate_decay The decay rate for the learning rate. If the objective function is large at the early iterations, +#' we need to have the higher learning rate to improve the computational efficiency. #' @param prioritize_jkstar When \code{prioritize_jkstar = TRUE}, the selected outcomes will prioritize best update j_k^star in SEC. #' @param func_compare The criterion when we update jk-star in SEC (default is "min_max"). 
-#' @param jk_equiv_cor The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome +#' @param jk_equiv_corr The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome #' @param jk_equiv_loglik The change of loglikelihood cutoff between overall best update jk-star and marginal best update jk-l for lth outcome -#' @param coloc_thres The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC). +#' @param coloc_thresh The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC). #' @param lambda The ratio [0,1] for z^2 and z in fun_prior simplex, defult is 0.5 -#' @param lambda_target The ratio for z^2 and z in fun_prior simplex for the target outcome, default is 1 -#' @param func_prior The data-drive local association simplex \eqn{\delta} for smoothing the weights. Default is "LD_z2z" is the elastic net for z-score and also weighted by LD. -#' @param func_multicorrection The alternative method to check the stop criteria. When \code{func_multicorrection = "lfdr"}, boosting iterations will be stopped +#' @param lambda_target_outcome The ratio for z^2 and z in fun_prior simplex for the target outcome, default is 1 +#' @param func_simplex The data-driven local association simplex \eqn{\delta} for smoothing the weights. Default is "LD_z2z" is the elastic net for z-score and also weighted by LD. +#' @param func_multi_test The alternative method to check the stop criteria. When \code{func_multi_test = "lfdr"}, boosting iterations will be stopped #' if the local FDR for all variables are greater than \code{lfsr_max}. -#' @param stop_null The cutoff of nominal p-value when \code{func_multicorrection = "Z"}. 
-#' @param multicorrection_max The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multicorrection = "lfdr"} or \code{func_multicorrection = "lfsr"}. -#' @param multicorrection_cut The cutoff of the smallest FDR for stop criteria when \code{func_multicorrection = "lfdr"} or \code{func_multicorrection = "lfsr"}. +#' @param stop_null The cutoff of nominal p-value when \code{func_multi_test = "Z"}. +#' @param multi_test_max The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. +#' @param multi_test_thresh The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}. +#' @param ash_prior The prior distribution for calculating lfsr when \code{func_multi_test = "lfsr"}. +#' @param p.adjust.methods The adjusted pvalue method in stats:p.adj when \code{func_multi_test = "fdr"} +#' @param residual_correlation The residual correlation based on the sample overlap, it is diagonal if it is NULL. #' -#' @section Post-processing Parameters +#' @section Post Inference Parameters #' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95). -#' @param between_cluster The small correlation for the weights distributions across different iterations to be decided having only one cluster. +#' @param min_cluster_corr The small correlation for the weights distributions across different iterations to be decided having only one cluster. #' @param dedup If \code{dedup = TRUE}, the duplicate confidence sets will be removed in the post-processing. #' @param overlap If \code{overlap = TRUE}, the overlapped confidence sets will be removed in the post-processing. #' @param n_purity The maximum number of confidence set (CS) variables used in calculating the correlation (\dQuote{purity}) statistics. 
@@ -73,30 +76,27 @@ #' @param median_abs_corr An alternative "purity" threshold for the CS. Median correlation between pairs of variables in a CS less than this #' threshold will be filtered out and not reported. When both min_abs_corr and median_abs_corr are set, #' a CS will only be removed if it fails both filters. Default set to NULL but it is recommended to set it to 0.8 in practice. -#' @param between_purity Minimum absolute correlation allowed to merge multiple colocalized sets. The default is 0.8 corresponding to a stringent threshold +#' @param median_cos_abs_corr Median absolute correlation between variants allowed to merge multiple colocalized sets. The default is 0.8 corresponding to a stringent threshold #' to merge colocalized sets, which may resulting in a huge set. #' @param tol A small, non-negative number specifying the convergence tolerance for checking the overlap of the variables in different sets. -#' @param merging When \code{merging = TRUE}, the sets for only one outcome will be merged if passed the \code{between_purity}. -#' @param coverage_singlew A number between 0 and 1 specifying the weight in each SEC (default is 0.8). -#' @param func_intw The integrated weight method. The default is "fun_R", indicating the same log-scale for different colocalized outcomes. -#' @param alpha The strenght to integrate weight from differnt outcomes, default is 1.5 -#' @param ash_prior The prior distribution for calculating lfsr when \code{func_multicorrection = "lfsr"}. -#' @param p.adjust.methods The adjusted pvalue method in stats:p.adj when \code{func_multicorrection = "fdr"} +#' @param merge_cos When \code{merge_cos = TRUE}, the sets for only one outcome will be merged if passed the \code{median_cos_abs_corr}. +#' @param sec_coverage_thresh A number between 0 and 1 specifying the weight in each SEC (default is 0.8). 
+#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 #' @param check_null The cut off value for change conditional objective function. Default is 0.1. #' @param check_null_method The metric to check the null sets. Default is "profile" #' @param check_null_max The smallest value of change of profile loglikelihood for each outcome. -#' @param residual_correlation The residual correlation based on the sample overlap, it is diagonal if it is NULL. -#' @param weaker_ucos If \code{weaker_ucos = TRUE}, consider the weaker single effect due to coupling effects -#' @param LD_obj When \code{LD_obj = FALSE}, objective fuunction doesn't include LD information. +#' @param weaker_effect If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects +#' @param LD_free When \code{LD_free = FALSE}, objective function doesn't include LD information. #' @param output_level When \code{output_level = 2}, return the ucos details for the single specific effects. #' When \code{output_level = 3}, return the entire Colocboost model to diagnostic results (more space). 
#' #' @return A \code{"colocboost"} object with some or all of the following elements: #' #' \item{cos_summary}{A summary table for colocalization events.} -#' \item{cos_details}{A object with all information for colocalization results.} #' \item{vcp}{The variable colocalized probability for each variable.} +#' \item{cos_details}{A object with all information for colocalization results.} #' \item{data_info}{A object with detailed information from input data} +#' \item{model_info}{A object with detailed information for colocboost model} #' #' @importFrom stats na.omit #' @export @@ -110,61 +110,58 @@ colocboost <- function(X = NULL, Y = NULL, # individual data dict_YX = NULL, # Y index for 1st column, X index for 2nd column dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column outcome_names = NULL, # the names of outcomes + ###### - GWAS-xQTL verison + target_outcome_idx = NULL, + target_outcome_variables = TRUE, + overlap_variables = FALSE, + intercept=TRUE, # centered genotype and phenotype + standardize=TRUE, # standardized genotype and phenotype ###### - HyPrColoc input effect_est = NULL, # same as HyPrColoc, beta hat matrix: with rowname of variable names effect_se = NULL, # same as HyPrColoc, sebeta hat matrix with rowname of variable names effect_n = NULL, - ###### - fixed argument - ########### - # - about stop + ###### - Model Parameters M = NULL, # maximum iteration time stop_thresh = 1e-06, # stop criterion for profile_log and objective functions - # - about update - step = 0.01, # minimum step size for updating in each iteration; supplefigure? - decayrate = 1, # decayrate for step size - tau = 0.01, # kernal_tau parameter for smoothing weights; supplefigure? 
+ tau = 0.01, # kernal_tau parameter for proximity smoothing weight + learning_rate_init = 0.01, # minimum step size for updating in each boosting round + learning_rate_decay = 1, # decay of learning rate + dynamic_learning_rate = TRUE, # if dynamic learning rate prioritize_jkstar = TRUE, # if prioritize j_k^star for updating - func_compare = "min_max", # the criterion when we update j_k^star; supplefigure? - jk_equiv_cor = 0.8, # check if jk_star ~ jk_r + func_compare = "min_max", # the criterion when we update j_k^star in Logic 3 + jk_equiv_corr = 0.8, # check if jk_star ~ jk_r jk_equiv_loglik = 1, # check if jk_star ~ jk_r. - coloc_thres = 0.1, - # - about data - intercept=TRUE, # centered genotype and phenotype - standardize=TRUE, # standardized genotype and phenotype - # - about post-processing - coverage = 0.95, # coverage of the confident set - between_cluster = 0.8, # only one cluster if all weights correlations bigger than this cut off - dedup = TRUE, # if remove the duplicate csets in the post-processing - overlap = TRUE, # if remove the overlapped csets - n_purity = 100, # the number of variables in purity - min_abs_corr = 0.5, # the cut off value of purity in each cset - median_abs_corr = NULL, - between_purity = 0.8, # minimum LD between two csets - tol = 1e-9, # tol for LD - merging = TRUE, # if merge two sets for one outcome - coverage_singlew = 0.8, + coloc_thresh = 0.1, lambda = 0.5, # the ratio for z^2 and z in weight penalty - lambda_target = 1, - func_intw = "fun_R", # integrated weight method - alpha = 1.5, # integrated weight smooth ratio - func_prior = "LD_z2z", # penalty for weights - func_multicorrection = "lfdr", - # --- add-hoc - target_idx = NULL, # important now for sumstat - target_variables = TRUE, - overlap_variables = FALSE, + lambda_target_outcome = 1, # the ratio for z^2 and z in weight penalty for target outcome + func_simplex = "LD_z2z", # data-driven association simplex + func_multi_test = "lfdr", stop_null = 1, - 
multicorrection_max = 1, - multicorrection_cut = 1, - ash_prior = "normal", # only applicable if func_multicorrection = lfsr + multi_test_max = 1, + multi_test_thresh = 1, + ash_prior = "normal", # only applicable if func_multi_test = lfsr p.adjust.methods = "fdr", + residual_correlation = NULL, # phenotypic correlation, it is diagonal if it is NULL + + ###### - Post Inference Parameters + coverage = 0.95, # coverage of CoS + min_cluster_corr = 0.8, # only one cluster if all weights correlations bigger than this cut off + dedup = TRUE, # if remove the duplicate CoS in the post-processing + overlap = TRUE, # if remove the overlapped CoS + n_purity = 100, # the number of variables in purity + min_abs_corr = 0.5, # the cut off value of purity in each CoS + median_abs_corr = NULL, + median_cos_abs_corr = 0.8, # minimum LD between two CoS + tol = 1e-9, # tol for LD + merge_cos = TRUE, # if merge two sets for one outcome + sec_coverage_thresh = 0.8, + weight_fudge_factor = 1.5, # integrative weight check_null = 0.1, # the cut off value for change conditional objective function - check_null_method = "profile", - check_null_max = 0.02, - residual_correlation = NULL, # sample overlap, it is diagonal if it is NULL - LD_obj = FALSE, - dynamic_step = TRUE, - weaker_ucos = TRUE, + check_null_method = "profile", # the metric to check the null sets. + check_null_max = 0.02, # the smallest value of change of profile loglikelihood for each outcome. 
+ weaker_effect = TRUE, + LD_free = FALSE, output_level = 1){ @@ -234,7 +231,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data }) } - keep.variable.individual <- lapply(X, colnames) + keep_variable_individual <- lapply(X, colnames) if (!is.list(X) & !is.list(Y)){ warning("Error: Input X and Y must be the list containing genotype matrics and all phenotype vectors!") return(NULL) @@ -270,7 +267,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } } } - # keep.variable.individual <- lapply(yx_dict, function(i) keep.variable.individual[[i]] ) + # keep_variable_individual <- lapply(yx_dict, function(i) keep_variable_individual[[i]] ) if (any(sapply(X, anyNA))) { warning("Error: Input X must not contain missing values.") return(NULL) @@ -307,7 +304,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } } } - } else {yx_dict <- keep.variable.individual <- NULL} + } else {yx_dict <- keep_variable_individual <- NULL} # - check summary-level data if ( (!is.null(sumstat)) | (!is.null(effect_est) & !is.null(effect_se)) ){ @@ -361,7 +358,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data }) } } - keep.variable.sumstat <- lapply(sumstat, function(xx){ xx$variant}) + keep_variable_sumstat <- lapply(sumstat, function(xx){ xx$variant}) # --- check input of LD if (is.null(LD)){ @@ -371,15 +368,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. 
", "Additionally, the purity of CoS cannot be evaluated!") - p.sumstat <- sapply(keep.variable.sumstat, length) + p.sumstat <- sapply(keep_variable_sumstat, length) p.unique <- unique(p.sumstat) if (length(p.unique)==1){ ld <- diag(1, nrow = p.unique) - colnames(ld) <- rownames(ld) <- keep.variable.sumstat[[1]] + colnames(ld) <- rownames(ld) <- keep_variable_sumstat[[1]] LD <- list(ld) sumstatLD_dict <- rep(1, length(sumstat)) } else { - LD <- lapply(keep.variable.sumstat, function(sn){ + LD <- lapply(keep_variable_sumstat, function(sn){ ld <- diag(1, nrow = length(sn)) colnames(ld) <- rownames(ld) <- sn return(ld) @@ -390,9 +387,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data # change some algorithm parameters M <- 1 # one iteration min_abs_corr = 0 # remove purity checking - jk_equiv_cor = 0 + jk_equiv_corr = 0 jk_equiv_loglik = 0.1 - func_prior = "only_z2z" + func_simplex = "only_z2z" } else { @@ -500,13 +497,13 @@ colocboost <- function(X = NULL, Y = NULL, # individual data Z[[i.summstat]] <- z } } else { - Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep.variable.sumstat <- NULL + Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL } # - initial colocboost object - keep.variables <- c(keep.variable.individual, keep.variable.sumstat) - overlapped_variables <- Reduce("intersect", keep.variables) - mean_variables <- mean(sapply(keep.variables, length)) - min_variables <- min(sapply(keep.variables, length)) + keep_variables <- c(keep_variable_individual, keep_variable_sumstat) + overlapped_variables <- Reduce("intersect", keep_variables) + mean_variables <- mean(sapply(keep_variables, length)) + min_variables <- min(sapply(keep_variables, length)) if (min_variables < 100){ warning("Warning message about the number of variables.\n", "The smallest number of variables across outcomes is ", min_variables, " <100. 
", @@ -527,9 +524,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data cb_data <- colocboost_init_data(X = X, Y = Y, dict_YX = yx_dict, Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, Var_y = Var_y, SeBhat = SeBhat, - keep.variables = keep.variables, - target_idx = target_idx, - target_variables = target_variables, + keep_variables = keep_variables, + target_outcome_idx = target_outcome_idx, + target_outcome_variables = target_outcome_variables, overlap_variables = overlap_variables, intercept = intercept, standardize = standardize, @@ -540,34 +537,33 @@ colocboost <- function(X = NULL, Y = NULL, # individual data cb_obj <- colocboost_workhorse(cb_data, M = M, prioritize_jkstar = prioritize_jkstar, - step = step, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_init = learning_rate_init, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, + lambda_target_outcome = lambda_target_outcome, stop_thresh = stop_thresh, - func_multicorrection = func_multicorrection, + func_multi_test = func_multi_test, stop_null = stop_null, - multicorrection_max = multicorrection_max, - multicorrection_cut = multicorrection_cut, + multi_test_max = multi_test_max, + multi_test_thresh = multi_test_thresh, ash_prior = ash_prior, p.adjust.methods = p.adjust.methods, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, func_compare = func_compare, - coloc_thres = coloc_thres, - LD_obj = LD_obj, - dynamic_step = dynamic_step, - target_idx = target_idx, + coloc_thresh = coloc_thresh, + LD_free = LD_free, + dynamic_learning_rate = dynamic_learning_rate, + target_outcome_idx = target_outcome_idx, outcome_names = outcome_names) # --- post-processing of the colocboost updates message("Starting assemble analyses and results summary.") cb_output <- colocboost_assemble(cb_obj, coverage = coverage, - func_intw = 
func_intw, - alpha = alpha, + weight_fudge_factor = weight_fudge_factor, check_null = check_null, check_null_method = check_null_method, check_null_max = check_null_max, @@ -575,16 +571,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data overlap = overlap, n_purity = n_purity, min_abs_corr = min_abs_corr, - coverage_singlew = coverage_singlew, + sec_coverage_thresh = sec_coverage_thresh, median_abs_corr = median_abs_corr, - between_cluster = between_cluster, - between_purity = between_purity, - weaker_ucos = weaker_ucos, - merging = merging, + min_cluster_corr = min_cluster_corr, + median_cos_abs_corr = median_cos_abs_corr, + weaker_effect = weaker_effect, + merge_cos = merge_cos, tol = tol, output_level = output_level) class(cb_output) <- "colocboost" - return(cb_output) } diff --git a/R/colocboost_addhoc_utils.R b/R/colocboost_addhoc_utils.R deleted file mode 100644 index 77c477a..0000000 --- a/R/colocboost_addhoc_utils.R +++ /dev/null @@ -1,391 +0,0 @@ - - -merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, - min_abs_corr = 0.5, tol = 1e-9, - between_purity = 0.8){ - - - change_obj_each <- out_ucos$change_obj_each - coloc_sets <- out_cos$cos$cos - ucos_each <- out_ucos$ucos_each - - # - remove overlap between coloc_sets and single_sets - is_overlap <- is_highLD <- c() - for (i in 1:length(coloc_sets)){ - # cset1 <- coloc_sets[[i]] - coloc_outcome <- out_cos$cos$coloc_outcomes[[i]] - ttmp <- as.numeric(gsub("[^0-9.]+", "", colnames( out_cos$cos$avWeight[[i]]))) - pos_name <- order(ttmp) - out_cos$cos$avWeight[[i]] <- out_cos$cos$avWeight[[i]][,pos_name] - for (j in 1:length(ucos_each)){ - cset2 <- ucos_each[[j]] - fine_outcome <- out_ucos$ucos_outcome[j] - - if (fine_outcome %in% coloc_outcome){ - - # - if fine_y in coloc_y, we check only the overlap - pp <- which(coloc_outcome == fine_outcome) - w_tmp <- out_cos$cos$avWeight[[i]][,pp] - cset1 <- get_in_csets(w_tmp, coverage = coverage)[[1]] - # - addhoc: if between_purity > 
0.8, remove single sets - X_dict <- cb_obj$cb_data$dict[fine_outcome] - res <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[fine_outcome]]$N, - miss_idx = cb_obj$cb_data$data[[fine_outcome]]$variable_miss, - P = cb_obj$cb_model_para$P) - # is.between <- length(intersect(cset1, cset2)) != 0 - is.between <- (abs(res[2]-1) < tol) - if (is.between){ - is_overlap = c(is_overlap, j) - } else { - is.higLD <- res[2] > between_purity - if (is.higLD){ is_highLD = c(is_highLD, j)} - } - - } else if (!(fine_outcome %in% coloc_outcome)){ - - # - if fine_y not in coloc_y, we check overlap and also min_purity - change_obj_coloc <- out_cos$cos$cs_change - cset1 <- coloc_sets[[i]] - res <- list() - for (ii in 1:cb_obj$cb_model_para$L){ - X_dict <- cb_obj$cb_data$dict[ii] - res[[ii]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[ii]]$N, - miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P) - } - res <- Reduce(pmax, res) - min_between <- res[1] - max_between <- res[2] - ave_between <- res[3] - is.between <- ((min_between>between_purity) & (abs(max_between-1) < tol)) - # is.between <- (min_between>min_abs_corr) * (abs(max_between-1)between_purity) - if (is.between){ - is_overlap = c(is_overlap, j) - # -- add weight - add_avW <- out_ucos$avW_ucos_each[,j] - out_cos$cos$avWeight[[i]] <- cbind(add_avW, out_cos$cos$avWeight[[i]]) - colnames( out_cos$cos$avWeight[[i]])[1] <- paste0("outcome", fine_outcome) - ttmp <- as.numeric(gsub("[^0-9.]+", "", colnames( out_cos$cos$avWeight[[i]]))) - pos_name <- order(ttmp) - out_cos$cos$avWeight[[i]] <- out_cos$cos$avWeight[[i]][,pos_name] - change_obj_coloc_i <- change_obj_coloc[i,] - change_obj_each_j <- change_obj_each[j,] - out_cos$cos$cs_change[i,] <- pmax(change_obj_coloc_i, change_obj_each_j) - coloc_outcome <- 
sort(c(coloc_outcome, fine_outcome)) - out_cos$cos$coloc_outcomes[[i]] <- coloc_outcome - } - - } - } - } - - is_overlap <- unique(c(is_overlap,is_highLD)) - if (length(is_overlap) != 0){ - if (length(is_overlap) == length(ucos_each)){ - out_ucos$ucos_each = NULL - out_ucos$avW_ucos_each <- NULL - out_ucos$change_obj_each <- NULL - out_ucos$purity_each <- NULL - } else { - out_ucos$ucos_each = ucos_each[-is_overlap] - out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[,-is_overlap,drop = FALSE] - out_ucos$change_obj_each <- change_obj_each[-is_overlap,,drop = FALSE] - out_ucos$purity_each <- out_ucos$purity_each[-is_overlap,,drop = FALSE] - } - } - if (length( out_ucos$ucos_each) == 0){ - out_ucos$ucos_each = NULL - out_ucos$avW_ucos_each <- NULL - out_ucos$change_obj_each <- NULL - out_ucos$purity_each <- NULL - } - ll <- list("ucos" = out_ucos, "cos" = out_cos) - return(ll) - -} - -#' @importFrom stats na.omit -merge_ucos <- function(cb_obj, past_out, - min_abs_corr = 0.5, - median_abs_corr = NULL, - n_purity = 100, - between_purity = 0.8, - tol = 1e-9){ - - - out_ucos <- past_out$ucos - out_cos <- past_out$cos - ucos_each <- out_ucos$ucos_each - change_obj_each <- out_ucos$change_obj_each - - # calculate between purity - ncsets <- length(ucos_each) - min_between <- max_between <- ave_between <- matrix(0, nrow = ncsets, ncol = ncsets) - for (i.between in 1:(ncsets-1)){ - for (j.between in (i.between+1):ncsets){ - cset1 <- ucos_each[[i.between]] - cset2 <- ucos_each[[j.between]] - y.i <- out_ucos$ucos_outcome[i.between] - y.j <- out_ucos$ucos_outcome[j.between] - if (y.i == y.j){ next } - yy <- c(y.i, y.j) - res <- list() - flag <- 1 - for (ii in yy){ - X_dict <- cb_obj$cb_data$dict[ii] - res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[ii]]$N, - miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P) - flag <- flag + 1 - } - if 
(min_abs_corr == 0){ - res <- Reduce(pmin, res) - } else { - res <- Reduce(pmax, res) - } - min_between[i.between, j.between] <- min_between[j.between, i.between] <- res[1] - max_between[i.between, j.between] <- max_between[j.between, i.between] <- res[2] - ave_between[i.between, j.between] <- ave_between[j.between, i.between] <- res[3] - } - } - # is.between <- (min_between>min_abs_corr) * (abs(max_between-1)between_purity) - is.between <- ((min_between>between_purity) & (abs(max_between-1) < tol)) - if (sum(is.between) != 0){ - temp <- sapply(1:nrow(is.between), function(x){ - tt <- c(x, which(is.between[x,] != 0)) - return(paste0(sort(tt), collapse = ";")) - }) - temp <- merge_sets(temp) - potential_merged <- lapply(temp, function(x) as.numeric(unlist(strsplit(x, ";")))) - potential_merged <- potential_merged[which(sapply(potential_merged, length) >= 2)] - coloc_sets_merged <- avWeight_merged <- - cs_change_merged <- coloc_outcomes_merged <- list() - is_merged <- c() - for (i.m in 1:length(potential_merged)){ - temp_set <- as.numeric(potential_merged[[i.m]]) - # refine avWeight - merged <- out_ucos$avW_ucos_each[, temp_set] - unique_coloc_outcomes <- as.numeric(gsub(".*Y(\\d+).*", "\\1", colnames(merged))) - if (length(unique(unique_coloc_outcomes))==1) next - # define merged set - coloc_sets_merged <- c(coloc_sets_merged, list( unique(unlist(ucos_each[temp_set])) )) - colnames(merged) <- paste0("outcome", unique_coloc_outcomes) - coloc_outcomes_merged <- c(coloc_outcomes_merged, - list(unique(sort(unique_coloc_outcomes)))) - # colnames(temp) <- unique_coloc_outcomes - avWeight_merged <- c(avWeight_merged, list(merged)) - # refine cs_change - change_cs_tmp <- change_obj_each[temp_set, , drop = FALSE] - cs_change_merged <- c(cs_change_merged, - list(apply(change_cs_tmp, 2, max))) - is_merged <- c(is_merged, temp_set) - } - - if (length(is_merged) != 0){ - - # --- check merged coloc set purity - purity = vector(mode='list', length=length(coloc_sets_merged)) - 
for(ee in 1:length(coloc_sets_merged)){ - coloc_t <- coloc_outcomes_merged[[ee]] - p_tmp <- c() - for (i3 in coloc_t){ - pos <- coloc_sets_merged[[ee]] - X_dict <- cb_obj$cb_data$dict[i3] - if (!is.null(cb_obj$cb_data$data[[X_dict]]$XtX)){ - pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss)) - # - it could happen since merging - if(sum(is.na(pos)) != 0){pos <- as.numeric(na.omit(pos))} - } - tmp <- matrix(get_purity(pos,X=cb_obj$cb_data$data[[X_dict]]$X, - Xcorr=cb_obj$cb_data$data[[X_dict]]$XtX, - N=cb_obj$cb_data$data[[i3]]$N,n=n_purity),1,3) - p_tmp <- rbind(p_tmp, tmp) - } - purity[[ee]] <- matrix(apply(p_tmp, 2, max),1,3) - - } - purity_all <- do.call(rbind, purity) - purity_all = as.data.frame(purity_all) - colnames(purity_all) = c("min_abs_corr","mean_abs_corr","median_abs_corr") - if (is.null(median_abs_corr)) { - is_pure = which(purity_all[,1] >= min_abs_corr) - } else { - is_pure = which(purity_all[,1] >= min_abs_corr | purity_all[,3] >= median_abs_corr) - } - if (length(is_pure) > 0){ - # add coloc - merged_colocsets <- coloc_sets_merged[is_pure] - names(merged_colocsets) <- paste0("merged_cos", 1:length(is_pure)) - out_cos$cos$cos <- c( out_cos$cos$cos, - merged_colocsets) - out_cos$cos$purity <- rbind( out_cos$cos$purity, - purity_all[is_pure, ]) - out_cos$cos$evidence_strength <- c( out_cos$cos$evidence_strength, - rep(0.95,length(is_pure))) - cs_change_tmp <- do.call(rbind, cs_change_merged)[is_pure, , drop=FALSE] - colnames(cs_change_tmp) <- paste0("change_obj_", 1:cb_obj$cb_model_para$L) - rownames(cs_change_tmp) <- names(merged_colocsets) - out_cos$cos$cs_change <- rbind( out_cos$cos$cs_change, - cs_change_tmp) - out_cos$cos$avWeight <- c( out_cos$cos$avWeight, - avWeight_merged[is_pure]) - out_cos$cos$coloc_outcomes <- c( out_cos$cos$coloc_outcomes, - coloc_outcomes_merged[is_pure]) - } - - # - remove single - if (length(is_merged) == length(ucos_each)){ - out_ucos$ucos_each <- NULL - 
out_ucos$avW_ucos_each <- NULL - out_ucos$change_obj_each <- NULL - out_ucos$purity_each <- NULL - } else { - out_ucos$ucos_each <- ucos_each[-is_merged] - out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[, -is_merged, drop = FALSE] - out_ucos$change_obj_each <- change_obj_each[-is_merged, , drop = FALSE] - out_ucos$purity_each <- out_ucos$purity_each[-is_merged, , drop = FALSE] - } - } - } - ll <- list("ucos" = out_ucos, "cos" = out_cos) - return(ll) -} - - -get_vcp <- function(past_out, P){ - if (!is.null(past_out$cos$cos$cos)){ - avW_coloc_vcp <- sapply(past_out$cos$cos$avWeight, get_integrated_weight) - } else {avW_coloc_vcp <- NULL} - all_weight <- avW_coloc_vcp - if (length(all_weight) == P){ - all_weight <- as.matrix(unlist(all_weight)) - } - if (!is.null(all_weight)){ - all_weight <- apply(all_weight, 2, as.numeric) - all_weight <- as.matrix(all_weight) - vcp <- as.vector(1 - apply(1 - all_weight,1,prod)) - } else { - vcp <- rep(0, P) - } - return(vcp) -} - - -get_pip <- function(past_out, R, P){ - if (length(past_out$cos$cos$cos)!=0){ - av_coloc <- do.call(cbind, past_out$cos$cos$avWeight) - } else {av_coloc = NULL} - if (length(past_out$ucos$ucos_each)!=0){ - av_noncoloc <- past_out$ucos$avW_ucos_each - tmp <- do.call(rbind, strsplit(colnames(av_noncoloc), ":")) - colnames(av_noncoloc) <- paste0("outcome", gsub("[^0-9.]+", "", tmp[,2])) - } else {av_noncoloc = NULL} - av_all <- cbind(av_coloc, av_noncoloc) - pip <- vector(mode='list', length=R) - if (!is.null(av_all)){ - av_name <- colnames(av_all) - for (i in 1:R){ - pos <- grep(i, av_name) - if (length(pos) != 0){ - av_i <- as.matrix(av_all[, pos]) - pip[[i]] <- as.vector(1 - apply(1-av_i,1,prod)) - } else { - pip[[i]] <- rep(0,P) - } - } - } - return(pip) -} - -check_two_overlap_sets <- function(total, i, j){ - t1 <- total[[i]] - t2 <- total[[j]] - if (t1 != "ONE" & t2 != "ONE"){ - return(ifelse(t1>t2, i, j)) - } else if (t1 == "ONE" & t2 != "ONE"){ - return(i) - } else if (t1 != "ONE" & t2 == "ONE"){ 
- return(j) - } else if (t1 == "ONE" & t2 == "ONE"){ - return(sample(c(i,j), 1)) - } -} - - -# Function to merge overlapping sets -merge_sets <- function(vec) { - split_lists <- lapply(vec, function(x) as.numeric(unlist(strsplit(x, ";")))) - result <- list() - while (length(split_lists) > 0) { - current <- split_lists[[1]] - split_lists <- split_lists[-1] - repeat { - overlap_index <- NULL - for (i in seq_along(split_lists)) { - if (length(intersect(current, split_lists[[i]])) > 0) { - overlap_index <- i - break - } - } - if (!is.null(overlap_index)) { - current <- union(current, split_lists[[overlap_index]]) - split_lists <- split_lists[-overlap_index] - } else { - break - } - } - result <- c(result, list(paste(sort(current), collapse = ";"))) - } - return(result) -} - - - -get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weight=FALSE){ - - avWeight <- lapply(coloc_outcomes, function(i){ - pos <- which(update[i,] != 0) - weight <- cb_model[[i]]$weights_path[match(pos.coloc, pos), ] - }) - avWeight <- do.call(cbind, avWeight) - if (name_weight) { - colnames(avWeight) <- paste0("outcome", coloc_outcomes) - } - return(avWeight) - -} - - - -get_max_profile <- function(cb_obj, check_null_max=0.02, check_null_method = "profile"){ - for (i in 1:cb_obj$cb_model_para$L){ - cb <- cb_obj$cb_model[[i]] - scaling_factor <- if(!is.null(cb_obj$cb_data$data[[i]]$N)) cb_obj$cb_data$data[[i]]$N-1 else 1 - if (check_null_method == "profile"){ - cb$check_null_max <- 1000*check_null_max / scaling_factor - } else { - cb$check_null_max <- check_null_max - } - cb_obj$cb_model[[i]] <- cb - } - return(cb_obj) -} - - -### Function for check cs for each weight -w_cs <- function(w, coverage = 0.95){ - n <- sum(cumsum(sort(w,decreasing = TRUE)) < coverage) + 1 - o <- order(w,decreasing = TRUE) - result = rep(0,length(w)) - result[o[1:n]] = 1 - return(result) -} - - - diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index cbee657..289d2ff 100644 --- 
a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -7,11 +7,11 @@ #' #' Un-colocalization signal - `colocboost_assemble_ucos` - identify the causal confidence sets for each outcome only. #' -#' Add-hoc merging functions including +#' Add-hoc merge_cos functions including #' #' \itemize{ -#' \item{merge_coloc_single}{merge the colocalized sets and the single causal set if pass the \code{between_purity}} -#' \item{merge_single}{merge the single causal sets for different outcomes if pass the \code{between_purity}} +#' \item{merge_coloc_single}{merge the colocalized sets and the single causal set if pass the \code{median_cos_abs_corr}} +#' \item{merge_single}{merge the single causal sets for different outcomes if pass the \code{median_cos_abs_corr}} #' } #' #' Refine of the colocalization sets (TO-DO-LIST) @@ -21,8 +21,7 @@ #' @export colocboost_assemble <- function(cb_obj, coverage = 0.95, - func_intw = "fun_R", - alpha = 1.5, + weight_fudge_factor = 1.5, check_null = 0.1, check_null_method = "profile", check_null_max=2e-5, @@ -30,12 +29,12 @@ colocboost_assemble <- function(cb_obj, overlap = TRUE, n_purity = 100, min_abs_corr = 0.5, - coverage_singlew = 0.8, + sec_coverage_thresh = 0.8, median_abs_corr = NULL, - between_cluster = 0.8, - between_purity = 0.8, - weaker_ucos = TRUE, - merging = TRUE, + min_cluster_corr = 0.8, + median_cos_abs_corr = 0.8, + weaker_effect = TRUE, + merge_cos = TRUE, tol = 1e-9, output_level = 1){ @@ -47,7 +46,7 @@ colocboost_assemble <- function(cb_obj, model_info <- get_model_info(cb_obj, outcome_names = data_info$outcome_info$outcome_names) if (data_info$n_outcomes == 1 & output_level == 1){ output_level = 2 } if (cb_obj$cb_model_para$num_updates == 1){ - cb_output <- list("cos_summary" = NULL, + cb_output <- list("cos_summary" = NULL, "vcp" = NULL, "cos_details" = NULL, "data_info" = data_info, @@ -64,8 +63,7 @@ colocboost_assemble <- function(cb_obj, cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", 
"model_info", "ucos_details", "diagnostic_details")] } if (data_info$n_outcome == 1){ - cb_output <- list("ucos_summary" = NULL, "pip" = NULL, - "ucos_details" = NULL, "data_info" = data_info) + cb_output <- list("ucos_summary", "pip" = NULL, "ucos_details" = NULL, "data_info" = data_info) } } @@ -79,23 +77,22 @@ colocboost_assemble <- function(cb_obj, # --------- about colocalized confidence sets --------------------------------- out_cos <- colocboost_assemble_cos(cb_obj, coverage = coverage, - func_intw = func_intw, - alpha = alpha, + weight_fudge_factor = weight_fudge_factor, check_null = check_null, check_null_method = check_null_method, dedup = dedup, overlap = overlap, n_purity = n_purity, min_abs_corr = min_abs_corr, - coverage_singlew = coverage_singlew, + sec_coverage_thresh = sec_coverage_thresh, median_abs_corr = median_abs_corr, - between_cluster = between_cluster, - between_purity = between_purity, + min_cluster_corr = min_cluster_corr, + median_cos_abs_corr = median_cos_abs_corr, tol = tol) # --------- about non-colocalized confidence sets --------------------------------- L <- cb_obj$cb_model_para$L - if (L == 1){ weaker_ucos = FALSE } + if (L == 1){ weaker_effect = FALSE } update <- cb_obj$cb_model_para$update_status ucos_each <- list() change_obj_each <- purity_each <- vector(mode='list', length=L) @@ -135,9 +132,9 @@ colocboost_assemble <- function(cb_obj, n_purity = n_purity, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, - between_cluster = between_cluster, - between_purity = between_purity, - weaker_ucos = weaker_ucos, + min_cluster_corr = min_cluster_corr, + median_cos_abs_corr = median_cos_abs_corr, + weaker_effect = weaker_effect, tol = tol) aaa <- out_ucos_each$ucos$ucos if (length(aaa) != 0){ @@ -168,7 +165,7 @@ colocboost_assemble <- function(cb_obj, if ( length(out_cos$cos$cos)!=0 & length(out_ucos$ucos_each)!=0){ past_out <- merge_cos_ucos(cb_obj, out_cos, out_ucos, coverage = coverage, min_abs_corr = min_abs_corr,tol = 
tol, - between_purity = between_purity) + median_cos_abs_corr = median_cos_abs_corr) } else if (length(out_cos$cos$cos)!=0 & length(out_ucos$ucos_each)==0) { past_out <- list("ucos" = NULL, "cos" = out_cos) } else if (length(out_cos$cos$cos)==0 & length(out_ucos$ucos_each)!=0){ @@ -178,29 +175,34 @@ colocboost_assemble <- function(cb_obj, } # ----- Merge two ucos sets if (length(past_out$ucos$ucos_each) > 1){ - if (merging) { + if (merge_cos) { past_out <- merge_ucos(cb_obj, past_out, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, n_purity = n_purity, - between_purity = between_purity, + median_cos_abs_corr = median_cos_abs_corr, tol = tol) } } ############# - extract colocboost output - #################### # - colocalization results - cb_obj$cb_model_para$alpha <- alpha + cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor cb_obj$cb_model_para$coverage <- coverage + if (cb_obj$cb_model_para$M==1){ + # fixme for single iteration model + cb_obj <- get_max_profile(cb_obj, check_null_max=0.01, check_null_method = "profile") + } cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info) cb_output <- list("vcp" = cos_results$vcp, "cos_details" = cos_results$cos_results, "data_info" = data_info, "model_info" = model_info) + class(cb_output) <- "colocboost" ### - extract summary table - target_idx <- cb_obj$cb_model_para$target_idx - summary_table <- get_cos_summary(cb_output, target_outcome = data_info$outcome_info$outcome_names[target_idx]) + target_outcome_idx <- cb_obj$cb_model_para$target_outcome_idx + summary_table <- get_cos_summary(cb_output) cb_output <- c(cb_output, list(cos_summary = summary_table)) cb_output <- cb_output[c("cos_summary", "vcp", "cos_details", "data_info", "model_info")] @@ -223,7 +225,7 @@ colocboost_assemble <- function(cb_obj, if (!is.null(cb_output$ucos_details$ucos)){ cb_output$pip <- apply(do.call(cbind,cb_output$ucos_details$ucos_weight), 1, function(w0) 1-prod(1-w0)) 
names(cb_output$pip) <- data_info$variables - cb_output$ucos_summary <- get_summary_table_fm(cb_output) + cb_output$ucos_summary <- get_ucos_summary(cb_output) } else { tmp <- list("pip" = NULL, "ucos_summary" = NULL) cb_output <- c(cb_output, tmp) @@ -232,7 +234,6 @@ colocboost_assemble <- function(cb_obj, } } } - return(cb_output) } diff --git a/R/colocboost_assemble_cos.R b/R/colocboost_assemble_cos.R index 4ed9996..8e28cdf 100644 --- a/R/colocboost_assemble_cos.R +++ b/R/colocboost_assemble_cos.R @@ -1,18 +1,17 @@ #' @importFrom stats as.dist cutree hclust colocboost_assemble_cos <- function(cb_obj, coverage = 0.95, - func_intw = "fun_R", - alpha = 1.5, + weight_fudge_factor = 1.5, check_null = 0.1, check_null_method = "profile", dedup = TRUE, overlap = TRUE, n_purity = 100, min_abs_corr = 0.5, - coverage_singlew = 0.8, + sec_coverage_thresh = 0.8, median_abs_corr = NULL, - between_cluster = 0.8, - between_purity = 0.8, + min_cluster_corr = 0.8, + median_cos_abs_corr = 0.8, tol = 1e-9){ if (!inherits(cb_obj, "colocboost")){ @@ -43,7 +42,7 @@ colocboost_assemble_cos <- function(cb_obj, X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(avWeight[,iiii,drop=FALSE], X=cb_data$data[[X_dict]]$X,Xcorr=cb_data$data[[X_dict]]$XtX, - N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = coverage_singlew, + N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss) check_purity[iiii] <- length(tmp) == 1 @@ -54,8 +53,8 @@ colocboost_assemble_cos <- function(cb_obj, pos_purity <- which(check_purity) avWeight <- avWeight[,pos_purity,drop=FALSE] coloc_outcomes <- coloc_outcomes[pos_purity] - weights <- get_integrated_weight(avWeight, func_intw = func_intw, alpha = alpha) - coloc_cos <- get_in_csets(weights, coverage = coverage) + weights <- get_integrated_weight(avWeight, weight_fudge_factor = 
weight_fudge_factor) + coloc_cos <- get_in_cos(weights, coverage = coverage) evidence_strength <- sum(weights[coloc_cos[[1]]]) # ----- null filtering @@ -144,7 +143,7 @@ colocboost_assemble_cos <- function(cb_obj, X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(avWeight[,iiii,drop=FALSE], X=cb_data$data[[X_dict]]$X,Xcorr=cb_data$data[[X_dict]]$XtX, - N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = coverage_singlew, + N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss) check_purity[iiii] <- length(tmp) == 1 @@ -155,8 +154,8 @@ colocboost_assemble_cos <- function(cb_obj, pos_purity <- which(check_purity) avWeight <- avWeight[,pos_purity,drop=FALSE] coloc_outcomes <- coloc_outcomes[pos_purity] - weights <- get_integrated_weight(avWeight, func_intw = func_intw, alpha = alpha) - coloc_cos <- get_in_csets(weights, coverage = coverage) + weights <- get_integrated_weight(avWeight, weight_fudge_factor = weight_fudge_factor) + coloc_cos <- get_in_cos(weights, coverage = coverage) evidence_strength <- sum(weights[coloc_cos[[1]]]) # ----- null filtering @@ -189,7 +188,7 @@ colocboost_assemble_cos <- function(cb_obj, # Hierachical Clustering iteration based on sequenced weights cormat = get_cormat(t(weight_coloc)) hc = hclust(as.dist(1-cormat)) - n_cluster = get_n_cluster(hc, cormat, between_cluster = between_cluster)$n_cluster + n_cluster = get_n_cluster(hc, cormat, min_cluster_corr = min_cluster_corr)$n_cluster index = cutree(hc,n_cluster) B = sapply(1:n_cluster, function(t) as.numeric(index==t)) B <- as.matrix(B) @@ -205,7 +204,7 @@ colocboost_assemble_cos <- function(cb_obj, X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(weight_cluster, X=cb_data$data[[X_dict]]$X,Xcorr=cb_data$data[[X_dict]]$XtX, - N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = 
coverage_singlew, + N=cb_data$data[[coloc_outcomes[iiii]]]$N, n=n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss) check_purity[[iiii]] <- tmp @@ -241,8 +240,8 @@ colocboost_assemble_cos <- function(cb_obj, for (i.w in 1:length(avWeight_coloc)){ w <- avWeight_coloc[[i.w]] if (sum(w) != 0){ - weights <- get_integrated_weight(w, func_intw = func_intw, alpha = alpha) - csets <- get_in_csets(weights, coverage = coverage) + weights <- get_integrated_weight(w, weight_fudge_factor = weight_fudge_factor) + csets <- get_in_cos(weights, coverage = coverage) coloc_cos[[fl]] <- unlist(csets) evidence_strength[[fl]] <- sum(weights[coloc_cos[[fl]]]) avWeight_csets[[fl]] <- w @@ -325,7 +324,7 @@ colocboost_assemble_cos <- function(cb_obj, - # --- filter 2*: remove overlap confidence sets based on between_purity + # --- filter 2*: remove overlap confidence sets based on median_cos_abs_corr if (length(coloc_sets) >= 2){ if (overlap){ @@ -351,7 +350,7 @@ colocboost_assemble_cos <- function(cb_obj, ave_between[i.between, j.between] <- ave_between[j.between, i.between] <- res[3] } } - is.between <- (min_between>min_abs_corr) * (abs(max_between-1)between_purity) + is.between <- (min_between>min_abs_corr) * (abs(max_between-1)median_cos_abs_corr) if (sum(is.between) != 0){ temp <- sapply(1:nrow(is.between), function(x){ tt <- c(x, which(is.between[x,] != 0)) diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R index 7339852..a363de6 100644 --- a/R/colocboost_assemble_ucos.R +++ b/R/colocboost_assemble_ucos.R @@ -9,9 +9,9 @@ colocboost_assemble_ucos <- function(cb_obj_single, n_purity = 100, min_abs_corr = 0.5, median_abs_corr = NULL, - between_cluster = 0.8, - between_purity = 0.5, - weaker_ucos = TRUE, + min_cluster_corr = 0.8, + median_cos_abs_corr = 0.5, + weaker_effect = TRUE, tol = 1e-9){ if (!inherits(cb_obj_single, "colocboost")){ @@ -43,7 
+43,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, res_temp <- check_null_post(cb_obj_single, confidence_sets, coloc_outcomes=1, check_null=check_null, check_null_method = check_null_method, - weaker_ucos = weaker_ucos) + weaker_effect = weaker_effect) cs_change <- res_temp$cs_change is_non_null <- res_temp$is_non_null @@ -113,7 +113,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, # Hierachical Clustering iteration based on weights cormat = get_cormat(t(weights)) hc = hclust(as.dist(1-cormat)) - n_cluster = get_n_cluster(hc, cormat, between_cluster = between_cluster)$n_cluster + n_cluster = get_n_cluster(hc, cormat, min_cluster_corr = min_cluster_corr)$n_cluster # n_cluster = 6 index = cutree(hc,n_cluster) B = sapply(1:n_cluster, function(t) as.numeric(index==t)) @@ -169,7 +169,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, res_temp <- check_null_post(cb_obj_single, confidence_sets, coloc_outcomes=1, check_null=check_null, check_null_method = check_null_method, - weaker_ucos = weaker_ucos) + weaker_effect = weaker_effect) cs_change <- res_temp$cs_change is_non_null <- res_temp$is_non_null if (length(is_non_null) > 0) { @@ -218,7 +218,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, } } - # --- filter 2*: remove overlap confidence sets based on between_purity + # --- filter 2*: remove overlap confidence sets based on median_cos_abs_corr if (length(confidence_sets) >= 2){ if (overlap){ @@ -239,7 +239,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, ave_between[i.between, j.between] <- ave_between[j.between, i.between] <- res[3] } } - is.between <- (min_between>min_abs_corr) * (abs(max_between-1)between_purity) + is.between <- (min_between>min_abs_corr) * (abs(max_between-1)median_cos_abs_corr) if (sum(is.between) != 0){ # potential_merged <- find_merged_csets(is.between) temp <- sapply(1:nrow(is.between), function(x){ diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index ce922ac..b185a4d 100644 
--- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -11,37 +11,37 @@ #' @export colocboost_check_update_jk <- function(cb_model, cb_model_para, cb_data, prioritize_jkstar = TRUE, - jk_equiv_cor = 0.8, ##### more than 2 traits + jk_equiv_corr = 0.8, ##### more than 2 traits jk_equiv_loglik = 1, ## more than 2 traits func_compare = "min_max", ##### more than 3 traits - coloc_thres = 0.1 + coloc_thresh = 0.1 ){ pos.update <- which(cb_model_para$update_y == 1) - target_idx <- cb_model_para$target_idx - if (is.null(target_idx)){ + target_outcome_idx <- cb_model_para$target_outcome_idx + if (is.null(target_outcome_idx)){ cb_model_para <- boost_check_update_jk_notarget(cb_model, cb_model_para, cb_data, prioritize_jkstar = prioritize_jkstar, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, func_compare = func_compare, - coloc_thres = coloc_thres) + coloc_thresh = coloc_thresh) } else { - if (target_idx %in% pos.update){ + if (target_outcome_idx %in% pos.update){ cb_model_para <- boost_check_update_jk_target(cb_model, cb_model_para, cb_data, prioritize_jkstar = prioritize_jkstar, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, func_compare = func_compare, - coloc_thres = coloc_thres, - target_idx = target_idx) + coloc_thresh = coloc_thresh, + target_outcome_idx = target_outcome_idx) } else { cb_model_para <- boost_check_update_jk_notarget(cb_model, cb_model_para, cb_data, prioritize_jkstar = prioritize_jkstar, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, func_compare = func_compare, - coloc_thres = coloc_thres) + coloc_thresh = coloc_thresh) } } return(cb_model_para) @@ -53,10 +53,10 @@ colocboost_check_update_jk <- function(cb_model, cb_model_para, cb_data, #' @importFrom stats median boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, prioritize_jkstar = TRUE, - 
jk_equiv_cor = 0.8, ##### more than 2 traits + jk_equiv_corr = 0.8, ##### more than 2 traits jk_equiv_loglik = 1, ## more than 2 traits func_compare = "min_max", ##### more than 3 traits - coloc_thres = 0.1 + coloc_thresh = 0.1 ){ @@ -68,8 +68,8 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, update_status <- rep(0, cb_model_para$L) update_jk <- rep(NA, cb_model_para$L+1) real_update_jk <- rep(NA, cb_model_para$L) - if (is.null(cb_model_para$coloc_thres)) - cb_model_para$coloc_thres <- (1-coloc_thres)*max(sapply(1:length(cb_model), function(i)max(cb_model[[i]]$change_loglike))) + if (is.null(cb_model_para$coloc_thresh)) + cb_model_para$coloc_thresh <- (1-coloc_thresh)*max(sapply(1:length(cb_model), function(i)max(cb_model[[i]]$change_loglike))) # - update only Ys which is not stop pos.update <- which(cb_model_para$update_y == 1) @@ -110,7 +110,7 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, model_update = model_update, cb_data = cb_data, X_dict = X_dict, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) if (all(judge_each)){ @@ -134,7 +134,7 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, model_update = model_update, cb_data = cb_data, X_dict = X_dict, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) pos <- which(colSums(change_each_pair) != 0) if (length(pos) == length(pos.update)){ @@ -188,7 +188,7 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, pos <- which(judge_each) # for jk = jk_each pos_not <- which(!judge_each) # for jk != jk_each check_coloc <- sapply(pos_not, function(i){ - check_temp <- model_update[[i]]$change_loglike[jk] > cb_model_para$coloc_thres + check_temp <- model_update[[i]]$change_loglike[jk] > cb_model_para$coloc_thresh return(check_temp) }) @@ -217,7 +217,7 @@ boost_check_update_jk_notarget <- function(cb_model, 
cb_model_para, cb_data, model_update = model_update[pos_index], cb_data = cb_data, X_dict = X_dict[pos_index], - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) pos_similar <- pos_not[which(colSums(change_each_pair) != 0)] # -- we only need to update paired Y_i at jk_i with higher change of loglikelihood @@ -289,7 +289,7 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, model_update = model_update[pos_not], cb_data = cb_data, X_dict = X_dict[pos_not], - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) pos_similar <- pos_not[which(colSums(change_each_pair) != 0)] @@ -346,11 +346,11 @@ boost_check_update_jk_notarget <- function(cb_model, cb_model_para, cb_data, boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, prioritize_jkstar = TRUE, - jk_equiv_cor = 0.8, ##### more than 2 traits + jk_equiv_corr = 0.8, ##### more than 2 traits jk_equiv_loglik = 1, ## more than 2 traits func_compare = "min_max", ##### more than 3 traits - coloc_thres = 0.1, - target_idx = 1 + coloc_thresh = 0.1, + target_outcome_idx = 1 ){ ############# Output ################# @@ -361,8 +361,8 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, update_status <- rep(0, cb_model_para$L) update_jk <- rep(NA, cb_model_para$L+1) real_update_jk <- rep(NA, cb_model_para$L) - if (is.null(cb_model_para$coloc_thres)) - cb_model_para$coloc_thres <- (1-coloc_thres)*max(sapply(1:length(cb_model), function(i)max(cb_model[[i]]$change_loglike))) + if (is.null(cb_model_para$coloc_thresh)) + cb_model_para$coloc_thresh <- (1-coloc_thresh)*max(sapply(1:length(cb_model), function(i)max(cb_model[[i]]$change_loglike))) # - update only Ys which is not stop pos.update <- which(cb_model_para$update_y == 1) @@ -393,7 +393,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, jk_temp <- which(temp == max(temp)) 
return(ifelse(length(jk_temp) == 1, jk_temp, sample(jk_temp,1))) }) - pp_target <- which(pos.update == target_idx) + pp_target <- which(pos.update == target_outcome_idx) jk_target <- jk_each[pp_target] # - if jk_target missing in all other traits @@ -409,7 +409,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, remain_jk = 1:cb_model_para$P) }) # ----- second, if within the same LD buddies, select the following variants - if (max(ld)>jk_equiv_cor){ + if (max(ld)>jk_equiv_corr){ cor_target <- abs_cor_vals_each[, pp_target] order_cor <- order(cor_target, decreasing = TRUE) jk_target_tmp <- setdiff(order_cor, variable_missing)[1] @@ -417,7 +417,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, ld_tmp <- get_LD_jk1_jk2(jk_target, jk_target_tmp, XtX = cb_data$data[[X_dict[pp_target]]]$XtX, remain_jk = 1:cb_model_para$P) - if (ld_tmp>jk_equiv_cor){ + if (ld_tmp>jk_equiv_corr){ jk_target <- jk_target_tmp jk_each[pp_target] <- jk_target } @@ -431,7 +431,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, model_update = model_update, cb_data = cb_data, X_dict = X_dict, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) judge_notarget <- judge_each[-pp_target] @@ -457,7 +457,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, # -- all strongest signals for non-target are different from the strongest signal for target trait check_coloc <- sapply(pos.update, function(i){ - check_temp <- cb_model[[i]]$change_loglike[jk_target] > cb_model_para$coloc_thres + check_temp <- cb_model[[i]]$change_loglike[jk_target] > cb_model_para$coloc_thresh return(check_temp) }) check_coloc <- check_coloc[-pp_target] @@ -486,7 +486,7 @@ boost_check_update_jk_target <- function(cb_model, cb_model_para, cb_data, pos <- which(judge_each) # for jk_target = jk_each pos_not <- which(!judge_each) # for jk_target != jk_each check_coloc <- 
sapply(pos_not, function(i){ - check_temp <- model_update[[i]]$change_loglike[jk_target] > cb_model_para$coloc_thres + check_temp <- model_update[[i]]$change_loglike[jk_target] > cb_model_para$coloc_thresh return(check_temp) }) @@ -558,7 +558,7 @@ check_jk_jkeach <- function(jk, jk_each, pos.update, model_update, cb_data, X_dict, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 0.001){ data_update = cb_data$data[pos.update] @@ -576,7 +576,7 @@ check_jk_jkeach <- function(jk, jk_each, XtX = cb_data$data[[X_dict[i]]]$XtX, N = data_update[[i]]$N, remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)) - judge[i] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_cor) + judge[i] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr) } else { judge[i] <- FALSE } @@ -591,7 +591,7 @@ check_pair_jkeach <- function(jk_each, pos.update, model_update, cb_data, X_dict, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 0.001){ data_update = cb_data$data[pos.update] @@ -612,7 +612,7 @@ check_pair_jkeach <- function(jk_each, XtX = cb_data$data[[X_dict[i]]]$XtX, N = data_update[[i]]$N, remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss)) - change_each_pair[i,j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_cor) + change_each_pair[i,j] <- (change_each <= jk_equiv_loglik) & (abs(LD_temp) >= jk_equiv_corr) } else { change_each_pair[i,j] <- FALSE } diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R new file mode 100644 index 0000000..8e0da1c --- /dev/null +++ b/R/colocboost_inference.R @@ -0,0 +1,434 @@ +#' @title Set of functions for post inferences from ColocBoost +#' +#' @description +#' The `colocboost_post_inference` functions access basic properties inferences from a fitted ColocBoost model. This documentation serves as a summary for all related post-inference functions. 
+#' +#' +#' @details +#' The following functions are included in this set: +#' `get_abc` get the colocalization summary table with or without the specific outcomes. +#' +#' These functions are not exported individually and are accessed via `colocboost_post_inference`. +#' +#' @rdname colocboost_post_inference +#' @keywords cb_post_inference +#' @export +colocboost_post_inference <- function() { + message("This function post inferences of colocboost output. See details for more information.") +} + + +# - Fast calculate correlation matrix +get_cormat <- function(X, intercepte = FALSE){ + X = t(X) + # Center each variable + if (!intercepte){ + X = X - rowMeans(X) + } + # Standardize each variable + X = X / sqrt(rowSums(X^2)) + # Calculate correlations + cr = tcrossprod(X) + return(cr) +} + + +#' @importFrom utils head tail +check_null_post <- function(cb_obj, + coloc_sets_temp, + coloc_outcomes, + check_null = 0.1, + check_null_method = "profile", + weaker_effect = TRUE){ + + extract_last <- function(lst) { tail(lst, n = 1)} + extract_first <- function(lst) { head(lst, n = 1)} + + get_profile <- function(cs_beta, X = NULL, Y = NULL, N = NULL, + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1){ + + if (!is.null(X)){ + mean((Y-X%*%as.matrix(cs_beta))^2) *N/(N-1) + } else if (!is.null(XtY)){ + scaling_factor <- if (!is.null(N)) (N - 1) else 1 + yty <- YtY / scaling_factor + xtx <- XtX + if (length(miss_idx)!=0){ + xty <- XtY[-miss_idx] / scaling_factor + cs_beta <- cs_beta[-miss_idx] + } else { xty <- XtY / scaling_factor } + + ( yty - 2*sum(cs_beta*xty) + sum( (xtx %*% as.matrix(cs_beta)) * cs_beta ) ) * adj_dep + } + + } + + get_cs_obj <- function(cs_beta, res, tau, func_simplex, lambda, adj_dep, LD_free, + X = NULL, Y = NULL, N = NULL, + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL){ + + correlation <- get_correlation(X = X, res = res, XtY = XtY, N = N, YtY = YtY, + XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx) + z <- get_z(correlation, n=N, res) 
+ abs_cor <- abs(correlation) + jk <- which(abs_cor == max(abs_cor)) + jk <- ifelse(length(jk) == 1, jk, sample(jk,1)) + P <- length(z) + ld_jk <- get_LD_jk(jk, X = X, XtX = XtX, N = N, + remain_idx = setdiff(1:P, miss_idx), P = P) + ld_feature <- sqrt(abs(ld_jk)) + # - calculate delta + delta <- boost_KL_delta(z = z, ld_feature = ld_feature, adj_dep=adj_dep, + func_simplex = func_simplex, lambda = lambda) + scaling_factor <- if (!is.null(N)) (N - 1) else 1 + cov_Xtr <- if(!is.null(X)){ + t(X)%*%res / scaling_factor + } else { + res / scaling_factor + } + obj_ld <- if (LD_free) ld_feature else rep(1, length(ld_feature)) + obj_ld[miss_idx] <- 0 + exp_term <- adj_dep * obj_ld * (abs(cov_Xtr)) + return(tau * matrixStats::logSumExp(exp_term / tau + log(delta))) + + } + + update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx){ + if (!is.null(X)){ + return(Y - X%*%cs_beta) + } else if (!is.null(XtX)){ + scaling.factor <- if (!is.null(N)) N-1 else 1 + xtx <- XtX / scaling.factor + if (length(miss_idx)!=0){ + xty <- XtY[-miss_idx] / scaling.factor + res.tmp <- rep(0, length(XtY)) + res.tmp[-miss_idx] <- xty - xtx%*%cs_beta[-miss_idx] + } else { + xty <- XtY / scaling.factor + res.tmp <- xty - xtx%*%cs_beta + } + return(res.tmp) + } + } + # - add hoc + cut <- if(length(cb_obj$cb_data)==1) 0.2 else 1 + + # ----- null filtering + cb_data <- cb_obj$cb_data + cs_change <- check_cs_change <- matrix(0, nrow = length(coloc_sets_temp), ncol = cb_obj$cb_model_para$L) + colnames(cs_change) <- colnames(check_cs_change) <- paste0("change_obj_", 1:cb_obj$cb_model_para$L) + max_change <- matrix(0, nrow = length(coloc_sets_temp), ncol = cb_obj$cb_model_para$L) + for (i in 1:length(coloc_sets_temp)){ + cs_variants <- as.numeric(unlist(coloc_sets_temp[[i]])) + for (j in coloc_outcomes){ + cs_beta <- cb_obj$cb_model[[j]]$beta + cs_beta[cs_variants] <- 0 + X_dict <- cb_data$dict[j] + adj_dep <- cb_data$data[[j]]$dependency + if (check_null_method == 
"profile"){ + cs_profile <- get_profile(cs_beta, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, + XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, + YtY = cb_data$data[[j]]$YtY, N = cb_data$data[[j]]$N, + miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep) + last_profile <- extract_last(cb_obj$cb_model[[j]]$profile_loglike_each) + change <- abs(cs_profile - last_profile) + # - add hoc + if (min(cb_obj$cb_model[[j]]$multi_correction_univariate[cs_variants]) >= cut){ change <- 0 } + # ------ + # - check_null + check_cs_change[i,j] <- change / diff(range(cb_obj$cb_model[[j]]$profile_loglike_each)) + cs_change[i,j] <- change # / diff(range(cb_obj$cb_model[[j]]$profile_loglike_each)) + first_profile <- extract_first(cb_obj$cb_model[[j]]$profile_loglike_each) + max_change[i,j] <- (first_profile - last_profile) >= cb_obj$cb_model[[j]]$check_null_max + } else if (check_null_method == "obj"){ + res <- update_res(X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, + XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, + N = cb_data$data[[j]]$N, cs_beta, + miss_idx = cb_data$data[[j]]$variable_miss) + cs_obj <- get_cs_obj(cs_beta, res, cb_obj$cb_model_para$tau, cb_obj$cb_model_para$func_simplex, + cb_obj$cb_model_para$lambda, adj_dep = adj_dep, + LD_free = cb_obj$cb_model_para$LD_free, + X = cb_data$data[[X_dict]]$X, N = cb_data$data[[j]]$N, + XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[X_dict]]$XtY, + YtY = cb_data$data[[X_dict]]$YtY, + miss_idx = cb_data$data[[j]]$variable_miss) + last_obj <- min(cb_obj$cb_model[[j]]$obj_path) + change <- abs(cs_obj - last_obj) + if (length(cb_obj$cb_model[[j]]$obj_path)==1){ + total_obj <- 1 + } else { + total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) + } + check_cs_change[i,j] <- change / total_obj + cs_change[i,j] <- change + max_change[i,j] <- total_obj >= cb_obj$cb_model[[j]]$check_null_max + } + } + } + if (!weaker_effect){ + check_cs_change <- cs_change + check_null_tmp 
<- sapply(1:cb_obj$cb_model_para$L, function(j) cb_obj$cb_model[[j]]$check_null_max) + } else { check_null_tmp <- rep(check_null, cb_obj$cb_model_para$L) } + cs_change <- as.data.frame(cs_change * max_change) # changed + for (ii in 1:nrow(check_cs_change)){ + cs_tmp <- check_cs_change[ii,] + check_cs_change[ii,] <- (cs_tmp>check_null_tmp) + } + is_non_null <- which(rowSums( check_cs_change * max_change ) != 0) + + ll = list("cs_change" = cs_change, + "is_non_null" = is_non_null) + return(ll) +} + +#' @importFrom stats na.omit +get_purity <- function(pos, X=NULL, Xcorr=NULL, N = NULL, n = 100) { + get_upper_tri = Rfast::upper_tri + get_median = Rfast::med + + if (sum(is.na(pos))!=0){ + pos <- as.numeric(na.omit(pos)) + } + + if (length(pos) == 1) + return(c(1,1,1)) + else { + + # Subsample the columns if necessary. + if (length(pos) > n) + pos = sample(pos,n) + + if (is.null(Xcorr)) { + X_sub = X[,pos] + X_sub = as.matrix(X_sub) + corr <- suppressWarnings({get_cormat(X_sub)}) + corr[which(is.na(corr))] = 0 + value = abs(get_upper_tri(corr)) + } else { + Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr + value = abs(get_upper_tri(Xcorr[pos,pos])) + } + return(c(min(value), + sum(value)/length(value), + get_median(value))) + } +} + +# ------ Calculate modularity ---------- +get_modularity <- function(Weight, B){ + if (dim(Weight)[1] == 1){ + Q <- 0 + } else { + W_pos <- Weight * (Weight > 0) + W_neg <- Weight * (Weight < 0) + N <- dim(Weight)[1] + K_pos <- colSums(W_pos) + K_neg <- colSums(W_neg) + m_pos <- sum(K_pos) + m_neg <- sum(K_neg) + m <- m_pos + m_neg + cate <- B %*% t(B) + if (m_pos == 0 & m_neg == 0){ + Q <- 0 + } else { + if (m_pos == 0){ + Q_positive <- 0 + Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg + } else if (m_neg == 0){ + Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos + Q_negative <- 0 + } else { + Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos + Q_negative <- 
sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg + } + } + Q <- m_pos / m * Q_positive - m_neg / m * Q_negative + } +} + +#' @importFrom stats cutree +get_n_cluster <- function(hc, Sigma, m=ncol(Sigma), min_cluster_corr = 0.8){ + if (min(Sigma) > min_cluster_corr){ + IND = 1 + Q = 1 + } else { + Q <- c() + if (ncol(Sigma) < 10){m = ncol(Sigma)} + for(i in 1:m) + { + index=cutree(hc,i) + B=sapply(1:i, function(t) as.numeric(index==t)) + Q[i] <- get_modularity(Sigma, B) + } + + IND=which(Q==max(Q)) + L=length(IND) + if (L>1) IND=IND[1] + } + return(list("n_cluster" = IND, + "Qmodularity" = Q)) +} + +w_purity <- function(weights, X=NULL, Xcorr=NULL, N = NULL, n = 100, coverage = 0.95, + min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL){ + + tmp <- apply(weights, 2, w_cs, coverage = coverage) + tmp_purity <- apply(tmp, 2, function(tt){ + pos <- which(tt == 1) + # deal with missing snp here + if (!is.null(Xcorr)){ + pos <- match(pos, setdiff(1:length(tmp), miss_idx)) + } + get_purity(pos, X=X, Xcorr=Xcorr, N=N, n = n) + }) + if (is.null(median_abs_corr)) { + is_pure = which(tmp_purity[1,] >= min_abs_corr) + } else { + is_pure = which(tmp_purity[1,] >= min_abs_corr | tmp_purity[3,] >= median_abs_corr) + } + return(is_pure) +} + + +#' @importFrom stats na.omit +# - Calculate purity between two confidence sets +get_between_purity <- function(pos1, pos2, X=NULL, Xcorr=NULL, N = NULL, miss_idx = NULL, P = NULL){ + + get_matrix_mult <- function(X_sub1, X_sub2){ + + X_sub1 = t(X_sub1) + X_sub2 = t(X_sub2) + # Standardize each variable + X_sub1 = X_sub1 / sqrt(rowSums(X_sub1^2)) + X_sub1[is.nan(X_sub1)] = 0 + X_sub2 = X_sub2 / sqrt(rowSums(X_sub2^2)) + X_sub2[is.nan(X_sub2)] = 0 + # Calculate correlations + cr = tcrossprod(X_sub1, X_sub2) + return(cr) + + } + get_median = Rfast::med + + if (is.null(Xcorr)){ + X_sub1 = X[,pos1,drop=FALSE] + X_sub2 = X[,pos2,drop=FALSE] + value <- abs(get_matrix_mult(X_sub1, X_sub2)) + + } else { + pos1 <- na.omit(match(pos1, 
setdiff(1:P, miss_idx))) + pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) + # scaling_factor <- if (!is.null(N)) N-1 else 1 + if (length(pos1)!=0 & length(pos2)!=0){ + value = abs(Xcorr[pos1,pos2]) # / scaling_factor + } else { + value = 0 + } + + } + return(c(min(value), max(value), get_median(value))) +} + +#' @importFrom stats var +#' @importFrom utils tail +get_cos_evidence <- function(cb_obj, coloc_out, data_info){ + + get_cos_config <- function(w, config_idx, weight_fudge_factor = 1.5, coverage = 0.95){ + + w_outcome <- colnames(w) + config_outcome <- paste0("outcome", config_idx) + pos <- which(w_outcome %in% config_outcome) + w_config <- w[, pos, drop=FALSE] + int_w <- get_integrated_weight(w_config, weight_fudge_factor = weight_fudge_factor) + unlist(get_in_cos(int_w, coverage = coverage)) + + } + + get_cos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1){ + + if (!is.null(X)){ + cos_profile <- mean((Y-X%*%as.matrix(cs_beta))^2) *N/(N-1) + yty <- var(Y) + } else if (!is.null(XtY)){ + scaling_factor <- if (!is.null(N)) (N - 1) else 1 + yty <- YtY / scaling_factor + xtx <- XtX + if (length(miss_idx)!=0){ + xty <- XtY[-miss_idx] / scaling_factor + cs_beta <- cs_beta[-miss_idx] + } else { xty <- XtY / scaling_factor } + + cos_profile <- ( yty - 2*sum(cs_beta*xty) + sum( (xtx %*% as.matrix(cs_beta)) * cs_beta ) ) * adj_dep + } + delta <- yty - cos_profile + if (delta <= 0){ + warning(paste("Warning message: potential sumstat & LD mismatch may happens for outcome", outcome_idx, + ". Using logLR = CoS(profile) - max(profile). 
Please check our website.")) + } + cos_profile + } + + # - + get_outcome_profile_change <- function(outcome_idx, cos, cb_obj, check_null_max){ + extract_last <- function(lst) { tail(lst, n = 1)} + cb_data <- cb_obj$cb_data + cs_beta <- rep(0, cb_obj$cb_model_para$P) + cs_beta[cos] <- cb_obj$cb_model[[outcome_idx]]$beta[cos] + X_dict <- cb_data$dict[outcome_idx] + cos_profile <- get_cos_profile(cs_beta, outcome_idx, + X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[outcome_idx]]$Y, + XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, + YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, + miss_idx = cb_data$data[[outcome_idx]]$variable_miss, + adj_dep = cb_data$data[[outcome_idx]]$dependency) + max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) + ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) + } + + # - calculate best configuration likelihood explained by minimal configuration + get_normalization_evidence <- function(profile_change, null_max, outcomes, outcome_names) { + # Define the exponential likelihood ratio normalization (ELRN) + logLR_normalization <- function(ratio) { 1 - exp( - 2*ratio ) } + + ratio <- profile_change / null_max + prob <- logLR_normalization(ratio) + df <- data.frame(outcomes_index = outcomes, relative_logLR = ratio, npc_outcome = prob) + rownames(df) <- outcome_names[outcomes] + sorted_df <- df[order(-df$relative_logLR), ] + return(sorted_df) + } + + get_npuc <- function(npc_outcome){ + max_idx <- which.max(npc_outcome) + npc_outcome[max_idx] * prod(1 - npc_outcome[-max_idx]) + } + + avWeight <- coloc_out$avWeight + cs_change <- coloc_out$cs_change + check_null_max <- sapply(cb_obj$cb_model, function(cb) cb$check_null_max) + outcome_names <- data_info$outcome_info$outcome_names + n_cos <- length(avWeight) + npc <- c() + normalization_evidence <- list() + for (i in 1:n_cos){ + w <- avWeight[[i]] + outcomes <- coloc_out$coloc_outcomes[[i]] + # most likely cos + 
cos <- get_cos_config(w, outcomes,weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor,coverage = cb_obj$cb_model_para$coverage) + profile_change_outcome <- sapply(outcomes, function(outcome_idx){ + get_outcome_profile_change(outcome_idx, cos, cb_obj, check_null_max) + }) + normalization_evidence[[i]] <- get_normalization_evidence(profile_change = profile_change_outcome, + null_max = check_null_max[outcomes], + outcomes, outcome_names) + + # - calcualte normalization probability of colocalization (NPC) + npuc <- get_npuc(normalization_evidence[[i]]$npc_outcome) + npc[i] <- 1 - npuc + } + names(normalization_evidence) <- names(npc) <- names(coloc_out$cos) + return(list(normalization_evidence=normalization_evidence, npc=npc)) +} + + diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 6b80aa0..2cc6104 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -32,9 +32,9 @@ colocboost_inits <- function() { colocboost_init_data <- function(X, Y, dict_YX, Z, LD, N_sumstat, dict_sumstatLD, Var_y, SeBhat, - keep.variables = NULL, - target_idx = NULL, - target_variables = TRUE, + keep_variables = NULL, + target_outcome_idx = NULL, + target_outcome_variables = TRUE, overlap_variables = FALSE, intercept=TRUE, standardize=TRUE, residual_correlation = NULL){ @@ -49,25 +49,25 @@ colocboost_init_data <- function(X, Y, dict_YX, } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)){ dict <- dict_sumstatLD } - if (target_variables & !is.null(target_idx)){ - if (target_idx > length(dict)){ + if (target_outcome_variables & !is.null(target_outcome_idx)){ + if (target_outcome_idx > length(dict)){ stop("Target outcome index is over the total number of outcomes! 
please check!") } - keep.variable.names <- keep.variables[[dict[target_idx]]] + keep_variable_names <- keep_variables[[dict[target_outcome_idx]]] if (overlap_variables){ - keep.tmp <- lapply(keep.variables[-dict[target_idx]], function(tm){ - intersect(keep.variable.names, tm) + keep_tmp <- lapply(keep_variables[-dict[target_outcome_idx]], function(tm){ + intersect(keep_variable_names, tm) }) - keep.variable.names <- Reduce(union, keep.tmp) + keep_variable_names <- Reduce(union, keep_tmp) } } else { if (overlap_variables){ - keep.variable.names <- Reduce(intersect, keep.variables) + keep_variable_names <- Reduce(intersect, keep_variables) } else { - keep.variable.names <- Reduce(union, keep.variables) + keep_variable_names <- get_merge_ordered_with_indices(keep_variables) } } - cb_data$variable.names <- keep.variable.names + cb_data$variable.names <- keep_variable_names flag <- 1 # if individual: X, Y if (!is.null(X) & !is.null(Y)){ @@ -103,14 +103,14 @@ colocboost_init_data <- function(X, Y, dict_YX, } } - # - if missing X - FIXME - variable.name <- keep.variables[[dict_YX[i]]] - if (length(variable.name) != length(keep.variable.names)){ - x_extend <- matrix(0, nrow = nrow(x_tmp), ncol = length(keep.variable.names), - dimnames = list(rownames(x_tmp), keep.variable.names)) - variable.tmp <- intersect(keep.variable.names, variable.name) - pos <- match(variable.tmp, keep.variable.names) - tmp$variable_miss <- setdiff(1:length(keep.variable.names), pos) + # - if missing X + variable.name <- keep_variables[[dict_YX[i]]] + if (length(variable.name) != length(keep_variable_names)){ + x_extend <- matrix(0, nrow = nrow(x_tmp), ncol = length(keep_variable_names), + dimnames = list(rownames(x_tmp), keep_variable_names)) + variable.tmp <- intersect(keep_variable_names, variable.name) + pos <- match(variable.tmp, keep_variable_names) + tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos) poss <- match(variable.tmp, variable.name) x_extend[,pos] <- x_tmp[,poss] x_tmp 
<- x_extend @@ -152,35 +152,30 @@ colocboost_init_data <- function(X, Y, dict_YX, # check LD if (dict_sumstatLD[i] == i){ - # --- final rank: keep.variable.names (N > M) N / M - # --- current list: final_variables (M = M1 n M2) <- LD (M1 x M1), Z (M2 x 1) >> LD (M x M) # - intersect variables - tmp_variables <- intersect(keep.variables[[flag1]], colnames(LD[[i]])) - pos.final <- sort(match(tmp_variables, keep.variable.names)) - final_variables <- keep.variable.names[pos.final] # keep the same rank with keep.variable.names + tmp_variables <- intersect(keep_variables[[flag1]], colnames(LD[[i]])) + pos.final <- sort(match(tmp_variables, keep_variable_names)) + final_variables <- keep_variable_names[pos.final] # keep the same rank with keep_variable_names # - check missing variables - tmp$variable_miss <- setdiff(1:length(keep.variable.names), pos.final) + tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos.final) # - set 0 to LD pos.LD <- match(final_variables, colnames(LD[[i]])) - # LD_sparse <- as(LD[[i]], "sparseMatrix") - # LD_tmp <- LD_sparse[pos.LD, pos.LD] LD_tmp <- LD[[i]][pos.LD, pos.LD] } else { - variable_i <- keep.variables[[flag1]] - variable_LD <- keep.variables[[dict_sumstatLD[i]+ind_idx]] + variable_i <- keep_variables[[flag1]] + variable_LD <- keep_variables[[dict_sumstatLD[i]+ind_idx]] if (length(which(is.na(match(variable_i, variable_LD)))) != 0){ # - intersect variables - tmp_variables <- intersect(keep.variables[[flag]], colnames(LD[[dict_sumstatLD[i]]])) - pos.final <- sort(match(tmp_variables, keep.variable.names)) - final_variables <- keep.variable.names[pos.final] # keep the same rank with keep.variable.names + tmp_variables <- intersect(keep_variables[[flag]], colnames(LD[[dict_sumstatLD[i]]])) + pos.final <- sort(match(tmp_variables, keep_variable_names)) + final_variables <- keep_variable_names[pos.final] # keep the same rank with keep_variable_names # - check missing variables - tmp$variable_miss <- 
setdiff(1:length(keep.variable.names), pos.final) + tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos.final) # - set 0 to LD pos.LD <- match(final_variables, colnames(LD[[dict_sumstatLD[i]]])) LD_tmp <- LD[[dict_sumstatLD[i]]][pos.LD, pos.LD] - # - need to change LD dict_sumstatLD[i] <- i @@ -191,8 +186,8 @@ colocboost_init_data <- function(X, Y, dict_YX, } # - set 0 to Z - Z_extend <- rep(0, length(keep.variable.names)) # N - pos.z <- match(final_variables, keep.variables[[flag1]]) # M <- M2 + Z_extend <- rep(0, length(keep_variable_names)) # N + pos.z <- match(final_variables, keep_variables[[flag1]]) # M <- M2 Z_extend[pos.final] <- Z[[i]][pos.z] # - organize data @@ -253,14 +248,14 @@ colocboost_init_data <- function(X, Y, dict_YX, #' @noRd #' @keywords cb_objects colocboost_init_model <- function(cb_data, - step = 0.01, + learning_rate_init = 0.01, stop_thresh = 1e-06, - func_multicorrection = "lfdr", + func_multi_test = "lfdr", stop_null = 0.05, - multicorrection_max = 1, + multi_test_max = 1, ash_prior = "normal", p.adjust.methods = "fdr", - target_idx = NULL){ + target_outcome_idx = NULL){ ################# initialization ####################################### cb_model = list() @@ -279,7 +274,7 @@ colocboost_init_model <- function(cb_data, "change_loglike" = NULL, "correlation" = NULL, "z" = NULL, - "step" = step, + "learning_rate_init" = learning_rate_init, "stop_thresh" = stop_thresh, "ld_jk" = c(), "jk" = c()) @@ -308,7 +303,7 @@ colocboost_init_model <- function(cb_data, multiple_correction <- 1 } else { multiple_correction <- get_multiple_correction(z=tmp$z, miss_idx = data_each$variable_miss, - func_multicorrection = func_multicorrection, + func_multi_test = func_multi_test, ash_prior = ash_prior, p.adjust.methods = p.adjust.methods) } @@ -316,13 +311,13 @@ colocboost_init_model <- function(cb_data, tmp$multi_correction_univariate <- multiple_correction if (all(multiple_correction==1)){ tmp$stop_null <- 1 - } else if 
(min(multiple_correction) > multicorrection_max){ + } else if (min(multiple_correction) > multi_test_max){ tmp$stop_null <- min(multiple_correction) } else { - if (min(multiple_correction) < multicorrection_max){ - tmp$stop_null <- multicorrection_max + if (min(multiple_correction) < multi_test_max){ + tmp$stop_null <- multi_test_max } else { - tmp$stop_null <- min(min(multiple_correction)+0.1, multicorrection_max) + tmp$stop_null <- min(min(multiple_correction)+0.1, multi_test_max) } } @@ -337,13 +332,13 @@ colocboost_init_model <- function(cb_data, #' @keywords cb_objects #' @importFrom utils tail colocboost_init_para <- function(cb_data, cb_model,tau=0.01, - func_prior = "z2z", - lambda = 0.5, lambda_target = 1, - multicorrection_cut=1, - func_multicorrection = "lfdr", - LD_obj = FALSE, + func_simplex = "z2z", + lambda = 0.5, lambda_target_outcome = 1, + multi_test_thresh=1, + func_multi_test = "lfdr", + LD_free = FALSE, outcome_names = NULL, - target_idx = NULL){ + target_outcome_idx = NULL){ ################# initialization ####################################### @@ -357,7 +352,7 @@ colocboost_init_para <- function(cb_data, cb_model,tau=0.01, profile_loglike <- sum(sapply(1:length(cb_model), function(i) tail(cb_model[[i]]$profile_loglike_each, n=1))) # - check initial update outcome stop_null <- sapply(cb_model, function(tmp) min(tmp$multi_correction_univariate)) - pos_stop <- which(stop_null >= multicorrection_cut) + pos_stop <- which(stop_null >= multi_test_thresh) update_y = rep(1, L) if (length(pos_stop) != 0){ update_y[pos_stop] <- 0 } else {pos_stop = NULL} @@ -375,19 +370,19 @@ colocboost_init_para <- function(cb_data, cb_model,tau=0.01, "P" = P, "N" = N, "tau" = tau, - "func_prior" = func_prior, + "func_simplex" = func_simplex, "lambda" = lambda, - "lambda_target" = lambda_target, + "lambda_target_outcome" = lambda_target_outcome, "profile_loglike" = profile_loglike, "update_status" = c(), "jk" = c(), "update_y" = update_y, "true_stop" = pos_stop, - 
"LD_obj" = LD_obj, + "LD_free" = LD_free, "real_update_jk" = c(), "outcome_names" = outcome_names, "variables" = cb_data$variable.names, - "target_idx" = target_idx, + "target_outcome_idx" = target_outcome_idx, "coveraged" = TRUE, "num_updates" = 1) class(cb_model_para) = "colocboost" @@ -560,17 +555,17 @@ get_padj <- function(z, miss_idx = NULL, p.adjust.methods = "fdr"){ # test also -get_multiple_correction <- function(z, miss_idx = NULL, func_multicorrection = "lfdr", +get_multiple_correction <- function(z, miss_idx = NULL, func_multi_test = "lfdr", ash_prior = "normal", p.adjust.methods = "fdr"){ - if (func_multicorrection == "lfsr"){ + if (func_multi_test == "lfsr"){ lfsr = get_lfsr(z=z, miss_idx = miss_idx, ash_prior = ash_prior) return(lfsr) - } else if (func_multicorrection == "lfdr"){ + } else if (func_multi_test == "lfdr"){ lfdr <- get_lfdr(z = z, miss_idx = miss_idx) return(lfdr) - } else if (func_multicorrection == "padj"){ + } else if (func_multi_test == "padj"){ fdr = get_padj(z = z, miss_idx = miss_idx, p.adjust.methods = p.adjust.methods) return(fdr) } diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index c1d768a..d15c1ee 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -1,35 +1,35 @@ colocboost_one_causal <- function(cb_model, cb_model_para, cb_data, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, tau = 0.01, - decayrate = 1, - func_prior = "z2z", + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = FALSE){ + lambda_target_outcome = 1, + LD_free = FALSE){ - if (jk_equiv_cor != 0){ + if (jk_equiv_corr != 0){ cb_obj <- colocboost_one_iteration(cb_model, cb_model_para, cb_data, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = 
lambda_target, - LD_obj = LD_obj) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free) } else { cb_obj <- colocboost_diagLD(cb_model, cb_model_para, cb_data, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - LD_obj = LD_obj) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free) } return(cb_obj) @@ -40,14 +40,14 @@ colocboost_one_causal <- function(cb_model, cb_model_para, cb_data, # under one causal per trait assumption with one iteration colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, tau = 0.01, - decayrate = 1, - func_prior = "z2z", + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = FALSE){ + lambda_target_outcome = 1, + LD_free = FALSE){ if (sum(cb_model_para$update_y == 1) != 0){ @@ -55,7 +55,7 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data, ######## - some traits updated # - step 1: check update clusters real_update <- boost_check_update_jk_one_causal(cb_model, cb_model_para, cb_data, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) # - step 2: boost update @@ -68,11 +68,11 @@ colocboost_one_iteration <- function(cb_model, cb_model_para, cb_data, # - update cb_model cb_model <- colocboost_update(cb_model, cb_model_para, cb_data, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - LD_obj = LD_obj) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free) } } # -- remove redundant parameters @@ -94,7 +94,7 @@ colocboost_one_iteration <- 
function(cb_model, cb_model_para, cb_data, boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data, prioritize_jkstar = TRUE, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1){ pos.update <- which(cb_model_para$update_y == 1) @@ -132,7 +132,7 @@ boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data, model_update = model_update, cb_data = cb_data, X_dict = X_dict, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) # define category with same jk temp <- sapply(1:nrow(change_each_pair), function(x){ @@ -168,14 +168,14 @@ boost_check_update_jk_one_causal <- function(cb_model, cb_model_para, cb_data, # under one causal per trait assumption with no LD colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, - jk_equiv_cor = 0, + jk_equiv_corr = 0, jk_equiv_loglik = 0.1, tau = 0.01, - decayrate = 1, - func_prior = "z2z", + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = FALSE){ + lambda_target_outcome = 1, + LD_free = FALSE){ if (sum(cb_model_para$update_y == 1) == 1){ @@ -196,8 +196,8 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, cb_model_para$update_status <- cbind(cb_model_para$update_status, as.matrix(update_status)) cb_model_para$real_update_jk <- rbind(cb_model_para$real_update_jk, real_update_jk) cb_model <- colocboost_update(cb_model, cb_model_para, cb_data, - tau = tau, decayrate = decayrate, func_prior = func_prior, - lambda = lambda, lambda_target = lambda_target, LD_obj = LD_obj) + tau = tau, learning_rate_decay = learning_rate_decay, func_simplex = func_simplex, + lambda = lambda, lambda_target_outcome = lambda_target_outcome, LD_free = LD_free) } @@ -226,8 +226,8 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, "real_update_jk" = real_update_jk) # - update cb_model cb_model_tmp <- colocboost_update(cb_model_tmp, cb_model_para, cb_data, - tau = tau, decayrate 
= decayrate, func_prior = func_prior, - lambda = lambda, lambda_target = lambda_target, LD_obj = LD_obj) + tau = tau, learning_rate_decay = learning_rate_decay, func_simplex = func_simplex, + lambda = lambda, lambda_target_outcome = lambda_target_outcome, LD_free = LD_free) weights <- rbind(weights, cb_model_tmp[[iy]]$weights_path) } ###### overlap weights @@ -237,7 +237,7 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, model_update = model_update, cb_data = cb_data, X_dict = X_dict, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik) change_each_pair <- change_each_pair * overlap_pair # define category with same jk @@ -277,11 +277,11 @@ colocboost_diagLD <- function(cb_model, cb_model_para, cb_data, # - update cb_model cb_model <- colocboost_update(cb_model, cb_model_para, cb_data, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - LD_obj = LD_obj) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free) } } # -- remove redundant parameters @@ -305,11 +305,11 @@ check_pair_overlap <- function(weights, coverage = 0.95){ overlap_pair <- matrix(NA, nrow = nrow(weights), ncol = nrow(weights)) for (i in 1:nrow(weights)){ w1 <- weights[i,] - cos1 <- get_in_csets(w1, coverage = coverage)[[1]] + cos1 <- get_in_cos(w1, coverage = coverage)[[1]] for (j in 1:nrow(weights)){ if (j != i){ w2 <- weights[j,] - cos2 <- get_in_csets(w2, coverage = coverage)[[1]] + cos2 <- get_in_cos(w2, coverage = coverage)[[1]] overlap <- intersect(cos1, cos2) if ( length(overlap)!=0 ){ diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 7cfd51c..cbc8b0f 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -1,285 +1,41 @@ -#' @title Set of functions for post inferences from ColocBoost +#' @rdname get_cos_summary #' -#' @description -#' The 
`colocboost_get_methods` functions access basic properties inferences from a fitted ColocBoost model. This documentation serves as a summary for all related post-inference functions. +#' @title Get colocalization summary table from a ColocBoost output. #' +#' @description `get_cos_summary` get the colocalization summary table with or without the outcomes of interest. #' -#' @details -#' The following functions are included in this set: -#' `get_summary_table` get the colocalization summary table with or without the specific outcomes. +#' @param cb_output Output object from `colocboost` analysis +#' @param outcome_names Optional vector of names of outcomes, which has the same order as Y in the original analysis. +#' @param interest_outcome Optional vector specifying a subset of outcomes from \code{outcome_names} to focus on. When provided, only colocalization events that include at least one of these outcomes will be returned. +#' @param gene_name Optional character string. When provided, adds a column with this gene name to the output table for easier filtering in downstream analyses. +#' +#' @return A summary table for colocalization events with the following columns: +#' \item{target_outcome}{The target outcome being analyzed if exists. 
Otherwise, it is \code{FALSE}.} +#' \item{colocalized_outcomes}{Colocalized outcomes for colocalization confidence set (CoS) } +#' \item{cos_id}{Unique identifier for colocalization confidence set (CoS) } +#' \item{purity}{Minimum absolute correlation of variables with in colocalization confidence set (CoS) } +#' \item{top_variable}{The variable with highest variant colocalization probability (VCP) } +#' \item{top_variable_vcp}{Variant colocalization probability for the top variable} +#' \item{cos_npc}{Normalized probability of colocalization} +#' \item{min_npc_outcome}{Minimum normalized probability of colocalized traits} +#' \item{n_variables}{Number of variables in colocalization confidence set (CoS)} +#' \item{colocalized_index}{Indices of colocalized variables} +#' \item{colocalized_variables}{List of colocalized variables} +#' \item{colocalized_variables_vcp}{Variant colocalization probabilities for all colocalized variables} #' -#' These functions are not exported individually and are accessed via `colocboost_get_methods`. +#' @examples +#' get_cos_summary(cb_output) +#' get_cos_summary(cb_output, interest_outcome = c("Y1", "Y2")) #' -#' @rdname colocboost_get_methods -#' @keywords cb_post_inference +#' @keywords cb_get_functions #' @export -colocboost_get_methods <- function() { - message("This function post inferences of colocboost output. 
See details for more information.") -} - -#' @noRd -#' @keywords cb_post_inference -get_data_info <- function(cb_obj){ - - ## - analysis data information - n_outcome <- cb_obj$cb_model_para$L - n_variables <- cb_obj$cb_model_para$P - analysis_outcome <- cb_obj$cb_model_para$outcome_names - variables <- cb_obj$cb_data$variable.names - target_outcome <- NULL - is_target <- rep(FALSE, n_outcome) - if (!is.null(cb_obj$cb_model_para$target_idx)){ - target_outcome <- analysis_outcome[cb_obj$cb_model_para$target_idx] - is_target[cb_obj$cb_model_para$target_idx] <- TRUE - } - is_sumstat <- grepl("sumstat_outcome", names(cb_obj$cb_data$data)) - outcome_info <- data.frame( - "outcome_names" = analysis_outcome, "sample_size" = cb_obj$cb_model_para$N, - "is_sumstats" = is_sumstat, "is_target" = is_target ) - rownames(outcome_info) <- paste0("y", 1:n_outcome) - - ## - marginal associations - z_scores <- lapply(cb_obj$cb_model, function(cb){ as.numeric(cb$z_univariate) }) - betas <- lapply(cb_obj$cb_model, function(cb){ as.numeric(cb$beta) }) - names(z_scores) <- names(betas) <- analysis_outcome - if (all(grepl("chr", variables))){ - # need to check with AI - # chr:pos:a1:a2 - # 1:pos:a1 - # chr_pos_ - # chr.pos - # y1: A, B, D, F, G - # y2: B, C, D, E - # A,B,C,D,E,F,G - - # - if variables_name has position informtaion, re-order all_info based on positions - # position <- as.numeric(sapply(variables, function(tmp) strsplit(tmp, ":")[[1]][2])) - position <- as.numeric(gsub(".*:(\\d+).*", "\\1", variables)) - ordered <- order(position, decreasing = FALSE) - z_scores <- lapply(z_scores, function(z) z[ordered]) - variables <- variables[ordered] - } - - ## - output data info - data.info <- list("n_outcomes" = n_outcome, - "n_variables" = n_variables, - "outcome_info" = outcome_info, - "variables" = variables, - "coef" = betas, - "z" = z_scores) - return(data.info) - -} - - - -#' @noRd -#' @keywords cb_post_inference -get_cos_details <- function(cb_obj, coloc_out, data_info = NULL){ 
- - if (is.null(data_info)) - data_info <- get_data_info(cb_obj) - - - ### ----- Define the colocalization results - coloc_sets <- coloc_out$cos - if (length(coloc_sets)!=0){ - - # - colocalization outcome configurations - tmp <- get_cos_evidence(cb_obj, coloc_out, data_info) - normalization_evidence <- tmp$normalization_evidence - npc <- tmp$npc - - # - colocalized outcomes - analysis_outcome <- cb_obj$cb_model_para$outcome_names - coloc_outcome_index <- coloc_outcome <- list() - colocset_names <- c() - for (i in 1:length(coloc_out$cos)){ - coloc_outcome_index[[i]] <- coloc_out$coloc_outcomes[[i]] - coloc_outcome[[i]] <- analysis_outcome[coloc_outcome_index[[i]]] - colocset_names[i] <- paste0("cos", i, ":", paste0(paste0("y", coloc_outcome_index[[i]]), collapse = "_")) - if (grepl("merged", names(coloc_sets)[i])) { - colocset_names[i] <- paste0(colocset_names[i], ":merged") - } - } - names(coloc_outcome) <- names(coloc_outcome_index) <- colocset_names - names(npc) <- names(normalization_evidence) <- colocset_names - coloc_outcomes <- list("outcome_index" = coloc_outcome_index, "outcome_name" = coloc_outcome) - - # - colocalized sets for variables - coloc_csets_variableidx <- coloc_out$cos - coloc_csets_variablenames <- lapply(coloc_csets_variableidx, function(coloc_tmp){ cb_obj$cb_model_para$variables[coloc_tmp] }) - coloc_csets_variableidx <- lapply(coloc_csets_variablenames, function(variable) match(variable, data_info$variables)) - names(coloc_csets_variableidx) <- names(coloc_csets_variablenames) <- colocset_names - coloc_csets_original <- list("cos_index" = coloc_csets_variableidx, "cos_variables" = coloc_csets_variablenames) - - # - colocalized set cs_change - cs_change <- coloc_out$cs_change - rownames(cs_change) <- colocset_names - colnames(cs_change) <- analysis_outcome - - # - VCP - cos_weights <- lapply(coloc_out$avWeight, function(w){ - pos <- match(data_info$variables, cb_obj$cb_model_para$variables) - return(w[pos, , drop = FALSE]) - }) - int_weight 
<- lapply(cos_weights, get_integrated_weight, alpha = cb_obj$cb_model_para$alpha) - names(int_weight) <- names(cos_weights) <- colocset_names - vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight),1,prod)) - names(vcp) <- data_info$variables - - # - resummary results - cos_re_idx <- lapply(int_weight, function(w){ unlist(get_in_csets(w, coverage = cb_obj$cb_model_para$coverage))}) - cos_re_var <- lapply(cos_re_idx, function(idx){ data_info$variables[idx] }) - coloc_csets <- list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var) - - # - hits variables in each csets - coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c() - for (i in 1:length(int_weight)){ - inw <- int_weight[[i]] - pp <- which(inw == max(inw)) - coloc_hits <- c(coloc_hits, pp) - coloc_hits_variablenames <- c(coloc_hits_variablenames, data_info$variables[pp]) - if (length(pp)==1){ - coloc_hits_names <- c(coloc_hits_names, names(int_weight)[i]) - } else { - coloc_hits_names <- c(coloc_hits_names, paste0(names(int_weight)[i], ".", 1:length(pp))) - } - } - coloc_hits <- data.frame("top_index" = coloc_hits, "top_variables" = coloc_hits_variablenames) - rownames(coloc_hits) <- coloc_hits_names - - # - purity - ncos <- length(coloc_csets$cos_index) - if (ncos >= 2){ - empty_matrix <- matrix(NA, ncos, ncos) - colnames(empty_matrix) <- rownames(empty_matrix) <- colocset_names - csets_purity <- lapply(1:3, function(ii){ - diag(empty_matrix) <- coloc_out$purity[,ii] - return(empty_matrix) - }) - for (i in 1:(ncos-1)){ - for (j in (i+1):ncos){ - cset1 <- coloc_csets$cos_index[[i]] - cset2 <- coloc_csets$cos_index[[j]] - y.i <- coloc_outcomes$outcome_index[[i]] - y.j <- coloc_outcomes$outcome_index[[j]] - yy <- unique(y.i, y.j) - res <- list() - flag <- 1 - for (ii in yy){ - X_dict <- cb_obj$cb_data$dict[ii] - res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[ii]]$N, - miss_idx = 
cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P) - flag <- flag + 1 - } - res <- Reduce(pmax, res) - csets_purity <- lapply(1:3, function(ii){ - csets_purity[[ii]][i,j] <- csets_purity[[ii]][j,i] <- res[ii] - return(csets_purity[[ii]]) - }) - } - } - names(csets_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") - - } else { - csets_purity <- coloc_out$purity - rownames(csets_purity) <- colocset_names - } - - # - save coloc_results - coloc_results <- list("cos" = coloc_csets, - "cos_outcomes" = coloc_outcomes, - "cos_vcp" = int_weight, - "cos_npc" = npc, - "cos_purity" = csets_purity, - "cos_top_variables" = coloc_hits, - "cos_outcomes_npc" = normalization_evidence, - "cos_weights" = cos_weights) - - - # - missing variable and warning message - missing_variables_idx <- Reduce(union, lapply(cb_obj$cb_data$data,function(cb) cb$variable_miss )) - missing_variables <- cb_obj$cb_model_para$variables[missing_variables_idx] - cos_missing_variables_idx <- lapply(coloc_csets_original$cos_variables, function(variable){ - missing <- intersect(variable, missing_variables) - if (length(missing)!=0){ - match(missing, data_info$variables) - } else { NULL } - }) - cos_missing_variables <- lapply(cos_missing_variables_idx, function(variable){if(!is.null(variable)){data_info$variables[variable]} else {NULL}}) - warning_needed <- any(!sapply(cos_missing_variables, is.null)) - if (warning_needed){ - is_missing <- which(!sapply(cos_missing_variables, is.null)) - cos_missing_variables_idx <- cos_missing_variables_idx[is_missing] - cos_missing_variables <- cos_missing_variables[is_missing] - cos_missing_vcp <- lapply(cos_missing_variables_idx, function(idx){ vcp[idx] }) - warning_message <- paste("CoS", paste(names(cos_missing_variables_idx), collapse = ","), - "contains missing variables in at least one outcome.", - "The missing variables will cause the ~0 VCP scores.") - cos_warnings <- list("cos_missing_info" = list("index" = cos_missing_variables_idx, 
- "variables" = cos_missing_variables, - "vcp" = cos_missing_vcp), - "warning_message" = warning_message) - coloc_results$cos_warnings <- cos_warnings - } - - } else { - coloc_results <- NULL - vcp <- NULL - } - return(list("cos_results"=coloc_results, "vcp"=vcp)) - -} - -#' @noRd -#' @keywords cb_post_inference -get_model_info <- function(cb_obj, outcome_names = NULL){ - - if (is.null(outcome_names)){ - data_info <- get_data_info(cb_obj) - outcome_names <- data_info$outcome_info$outcome_names - } - - profile_loglik <- cb_obj$cb_model_para$profile_loglike - n_updates <- cb_obj$cb_model_para$num_updates - model_coveraged <- cb_obj$cb_model_para$coveraged - jk_update <- cb_obj$cb_model_para$real_update_jk - outcome_proximity_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_path) - outcome_coupled_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single) - outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each) - names(outcome_proximity_obj) <- names(outcome_coupled_obj) <- - names(outcome_profile_loglik) <- outcome_names - ll <- list("model_coveraged" = model_coveraged, - "n_updates" = n_updates, - "profile_loglik" = profile_loglik, - "outcome_profile_loglik" = outcome_profile_loglik, - "outcome_proximity_obj" = outcome_proximity_obj, - "outcome_coupled_obj" = outcome_coupled_obj, - "jk_update" = jk_update) - return(ll) +get_cos_summary <- function(cb_output, + outcome_names = NULL, + interest_outcome = NULL, + gene_name = NULL){ -} - - - -#' @rdname colocboost_get_methods -#' @title Extract colocalization summary table -#' -#' @description -#' Get the colocalization summary table with or without the specific outcomes. 
-#' -#' @examples -#' get_summary_table(cb_output) -#' get_summary_table(cb_output, target_outcome = c("Y1", "Y2")) -#' -#' @noRd -#' @keywords cb_post_inference -get_cos_summary <- function(cb_output, outcome_names = NULL, gene_name = NULL, - target_outcome = NULL){ + if (!inherits(cb_output, "colocboost")){ + stop("Input must from colocboost output!")} coloc_csets <- cb_output$cos_details$cos$cos_index if (length(coloc_csets) != 0){ @@ -293,9 +49,9 @@ get_cos_summary <- function(cb_output, outcome_names = NULL, gene_name = NULL, } vcp <- as.numeric(cb_output$vcp) - summary_table <- matrix(NA, nrow = length(coloc_sets), ncol = 11) + summary_table <- matrix(NA, nrow = length(coloc_sets), ncol = 12) colnames(summary_table) <- c("target_outcome", "colocalized_outcomes", "cos_id", "purity", - "top_variable", "top_variable_vcp", "cos_npc", "n_variables", + "top_variable", "top_variable_vcp", "cos_npc", "min_npc_outcome", "n_variables", "colocalized_index", "colocalized_variables", "colocalized_variables_vcp") summary_table <- as.data.frame(summary_table) summary_table[,1] <- FALSE @@ -304,403 +60,124 @@ get_cos_summary <- function(cb_output, outcome_names = NULL, gene_name = NULL, summary_table[,4] <- as.numeric(diag(as.matrix(cb_output$cos_details$cos_purity$min_abs_cor))) summary_table[,5] <- unlist(sapply(cb_output$cos_details$cos$cos_variables, function(tmp) tmp[1])) summary_table[,6] <- sapply(coloc_sets, function(tmp) max(vcp[tmp])) - summary_table[,7] <- as.numeric(cb_output$cos_details$cos_npc) - summary_table[,8] <- as.numeric(sapply(coloc_sets, length)) - summary_table[,9] <- unlist(sapply(coloc_sets, function(tmp) paste0(tmp, collapse = "; "))) - summary_table[,10] <- unlist(sapply(cb_output$cos_details$cos$cos_variables, function(tmp) paste0(tmp, collapse = "; "))) - summary_table[,11] <- unlist(sapply(coloc_sets, function(tmp) paste0(vcp[tmp], collapse = "; "))) + summary_table[,7] <- round(as.numeric(cb_output$cos_details$cos_npc), 4) + summary_table[,8] 
<- round(as.numeric(cb_output$cos_details$cos_min_npc_outcome), 4) + summary_table[,9] <- as.numeric(sapply(coloc_sets, length)) + summary_table[,10] <- unlist(sapply(coloc_sets, function(tmp) paste0(tmp, collapse = "; "))) + summary_table[,11] <- unlist(sapply(cb_output$cos_details$cos$cos_variables, function(tmp) paste0(tmp, collapse = "; "))) + summary_table[,12] <- unlist(sapply(coloc_sets, function(tmp) paste0(vcp[tmp], collapse = "; "))) if (!is.null(gene_name)){ summary_table$gene_name <- gene_name } - - if (!is.null(target_outcome)){ + # - if target colocalization + target_outcome_idx <- which(cb_output$data_info$outcome_info$is_target) + if (length(target_outcome_idx)!=0){ + target_outcome <- analysis_outcome[target_outcome_idx] tmp <- sapply(target_outcome, function(tmp) grep(paste0(tmp, "\\b"), analysis_outcome)) - if (all(sapply(tmp, length)!=0)){ - if.target <- sapply(coloc_outcome, function(cp){ - tt <- sapply(target_outcome, function(tmp) grep(paste0(tmp, "\\b"), cp)) - all(sapply(tt, length)!=0) - }) - summary_table$target_outcome <- ifelse(if.target, target_outcome, FALSE) - summary_table=summary_table[order(summary_table$target_outcome == "FALSE"), ] - if (sum(if.target) == 0){ warnings("No colocalization with target outcomes.") } - } else { - warnings("Target outcome is not in the analysis outcomes, please check.") - } - } - } else { - summary_table <- NULL - } - return(summary_table) - -} - - -#' @noRd -#' @keywords cb_post_inference -get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output = NULL, weaker_ucos = FALSE){ - - cb_model <- cb_obj$cb_model - cb_model_para <- cb_obj$cb_model_para - - ## - obtain the order of variables based on the variables names if it has position information - if (is.null(variables)){ - variables <- cb_obj$cb_model_para$variables - if (all(grepl("chr", variables))){ - # - if variables_name has position informtaion, re-order all_info based on positions - position <- 
as.numeric(gsub(".*:(\\d+).*", "\\1", variables)) - # position <- as.numeric(sapply(variables_name, function(tmp) strsplit(tmp, ":")[[1]][2])) - ordered <- order(position, decreasing = FALSE) - } else { - ordered <- 1:length(variables) - } - } else { - ordered <- match(variables, cb_obj$cb_model_para$variables) - } - - ## - reorder all output - # - cb_model - tmp <- lapply(cb_model, function(cb){ - cb$beta <- cb$beta[ordered] - cb$weights_path <- cb$weights_path[, ordered] - cb$change_loglike <- cb$change_loglike[ordered] - cb$correlation <- as.numeric(cb$correlation[ordered]) - cb$z <- as.numeric(cb$z[ordered]) - cb$ld_jk <- cb$ld_jk[,ordered] - cb$z_univariate <- as.numeric(cb$z_univariate[ordered]) - cb$beta_hat <- as.numeric(cb$beta_hat[ordered]) - cb$multi_correction <- as.numeric(cb$multi_correction[ordered]) - cb$multi_correction_univariate <- as.numeric(cb$multi_correction_univariate[ordered]) - return(cb) - }) - cb_model <- tmp - - # - sets - if (!is.null(past_out)){ - out_ucos <- past_out$ucos - # - single sets - if (!is.null(out_ucos$ucos_each)){ - - out_ucos$ucos_each <- lapply(out_ucos$ucos_each, function(cs){ - match(cb_model_para$variables[cs], variables) + if.target <- sapply(coloc_outcome, function(cp){ + tt <- sapply(target_outcome, function(tmp) grep(paste0(tmp, "\\b"), cp)) + all(sapply(tt, length)!=0) }) - out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[ordered,,drop=FALSE] - - # - re-orginize specific results - analysis_outcome <- cb_obj$cb_model_para$outcome_names - specific_outcome_index <- specific_outcome <- list() - specific_cs_names <- c() - for (i in 1:length(out_ucos$ucos_each)){ - cc <- out_ucos$avW_ucos_each[,i,drop=FALSE] - tmp_names <- colnames(cc) - specific_outcome_index[[i]] <- as.numeric(gsub(".*Y([0-9]+).*", "\\1", tmp_names)) - specific_outcome[[i]] <- analysis_outcome[specific_outcome_index[[i]]] - specific_cs_names[i] <- paste0("ucos", i, ":y", specific_outcome_index[[i]]) - } - names(specific_outcome) <- 
names(specific_outcome_index) <- specific_cs_names - specific_outcomes <- list("outcome_index" = specific_outcome_index, "outcome_name" = specific_outcome) - - # - specific sets for variables - specific_cs_variableidx <- out_ucos$ucos_each - specific_cs_variablenames <- lapply(specific_cs_variableidx, function(specific_tmp){ cb_obj$cb_model_para$variables[specific_tmp] }) - specific_cs_variableidx <- lapply(specific_cs_variablenames, function(variable) match(variable, variables)) - names(specific_cs_variableidx) <- names(specific_cs_variablenames) <- specific_cs_names - specific_css <- list("ucos_index" = specific_cs_variableidx, "ucos_variables" = specific_cs_variablenames) - - # - specific set cs_change - cs_change <- out_ucos$change_obj_each - rownames(cs_change) <- specific_cs_names - colnames(cs_change) <- analysis_outcome - index_change <- as.data.frame(which(cs_change != 0, arr.ind = TRUE)) - change_outcomes <- analysis_outcome[index_change$col] - change_values <- diag(as.matrix(cs_change[index_change$row, index_change$col])) - cs_change <- data.frame("ucos_outcome" = change_outcomes, "ucos_delta" = change_values) - - # - filter weak ucos - check_null_max <- sapply(cb_model, function(cb) cb$check_null_max) - remove_weak <- sapply(1:nrow(cs_change), function(ic){ - outcome_tmp <- cs_change$ucos_outcome[ic] - delta_tmp <- cs_change$ucos_delta[ic] - pp <- which(cb_obj$cb_model_para$outcome_names == outcome_tmp) - check_tmp <- check_null_max[pp] - delta_tmp >= check_tmp + summary_table$target_outcome <- ifelse(if.target, target_outcome, FALSE) + summary_table=summary_table[order(summary_table$target_outcome == "FALSE"), ] + if (sum(if.target) == 0){ warnings("No colocalization with target outcomes.") } + } + # - if extract only interest outcome colocalization + if (!is.null(interest_outcome)){ + tmp <- sapply(interest_outcome, function(tmp) grep(paste0(tmp, "\\b"), analysis_outcome)) + if (all(sapply(tmp, length)!=0)){ + if.interest <- sapply(coloc_outcome, 
function(cp){ + tt <- sapply(interest_outcome, function(tmp) grep(paste0(tmp, "\\b"), cp)) + all(sapply(tt, length)!=0) }) - keep_ucos <- which(remove_weak) - if (length(keep_ucos)==0){ - specific_results <- NULL - } else { - specific_outcomes$outcome_index <- specific_outcomes$outcome_index[keep_ucos] - specific_outcomes$outcome_name <- specific_outcomes$outcome_name[keep_ucos] - specific_css$ucos_index <- specific_css$ucos_index[keep_ucos] - specific_css$ucos_variables <- specific_css$ucos_variables[keep_ucos] - cs_change <- cs_change[keep_ucos, , drop = FALSE] - out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[, keep_ucos, drop = FALSE] - specific_cs_names <- specific_cs_names[keep_ucos] - out_ucos$purity_each <- out_ucos$purity_each[keep_ucos, , drop = FALSE] - - # - ucos_weight - specific_w <- lapply(1:ncol(out_ucos$avW_ucos_each), function(ii) out_ucos$avW_ucos_each[,ii,drop=FALSE]) - names(specific_w) <- specific_cs_names - - # - hits variables in each csets - cs_hits <- sapply(1:length(specific_w), function(jj){ inw=specific_w[[jj]]; sample(which(inw == max(inw)),1) }) - cs_hits_variablenames <- sapply(cs_hits, function(ch) variables[ch] ) - specific_cs_hits <- data.frame("top_index" = cs_hits, "top_variables" = cs_hits_variablenames) # save - rownames(specific_cs_hits) <- specific_cs_names - - # - purity - nucos <- length(specific_css$ucos_index) - if (nucos >= 2){ - empty_matrix <- matrix(NA, nucos, nucos) - colnames(empty_matrix) <- rownames(empty_matrix) <- specific_cs_names - specific_cs_purity <- lapply(1:3, function(ii){ - diag(empty_matrix) <- out_ucos$purity_each[,ii] - return(empty_matrix) - }) - for (i in 1:(nucos-1)){ - for (j in (i+1):nucos){ - cset1 <- specific_css$ucos_index[[i]] - cset2 <- specific_css$ucos_index[[j]] - y.i <- specific_outcomes$outcome_index[[i]] - y.j <- specific_outcomes$outcome_index[[j]] - yy <- unique(y.i, y.j) - res <- list() - flag <- 1 - for (ii in yy){ - X_dict <- cb_obj$cb_data$dict[ii] - res[[flag]] <- 
get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[ii]]$N, - miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P) - flag <- flag + 1 - } - res <- Reduce(pmax, res) - specific_cs_purity <- lapply(1:3, function(ii){ - specific_cs_purity[[ii]][i,j] <- specific_cs_purity[[ii]][j,i] <- res[ii] - return(specific_cs_purity[[ii]]) - }) - } - } - names(specific_cs_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") - - } else { - specific_cs_purity <- out_ucos$purity_each - rownames(specific_cs_purity) <- specific_cs_names - } - - # - cos&ucos purity - cos <- cb_output$cos_details$cos$cos_index - ncos <- length(cos) - if (ncos!=0){ - empty_matrix <- matrix(NA, ncos, nucos) - colnames(empty_matrix) <- specific_cs_names - rownames(empty_matrix) <- names(cos) - cos_ucos_purity <- lapply(1:3, function(ii) empty_matrix ) - for (i in 1:ncos){ - for (j in 1:nucos){ - cset1 <- cos[[i]] - cset2 <- specific_css$ucos_index[[j]] - y.i <- cb_output$cos_details$cos_outcomes$outcome_index[[i]] - y.j <- specific_outcomes$outcome_index[[j]] - yy <- unique(y.i, y.j) - res <- list() - flag <- 1 - for (ii in yy){ - X_dict <- cb_obj$cb_data$dict[ii] - res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, - Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, - N = cb_obj$cb_data$data[[ii]]$N, - miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, - P = cb_obj$cb_model_para$P) - flag <- flag + 1 - } - res <- Reduce(pmax, res) - cos_ucos_purity <- lapply(1:3, function(ii){ - cos_ucos_purity[[ii]][i,j] <- res[ii] - return(cos_ucos_purity[[ii]]) - }) - } - } - names(cos_ucos_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") - } else { - cos_ucos_purity <- NULL - } - - - # - save coloc_results - specific_results <- list("ucos" = specific_css, - "ucos_outcomes" = specific_outcomes, - "ucos_weight" = specific_w, - "ucos_top_variables" = 
specific_cs_hits, - "ucos_purity" = specific_cs_purity, - "cos_ucos_purity" = cos_ucos_purity, - "ucos_outcomes_delta" = cs_change) - } - - } else { - specific_results <- NULL + summary_table$interest_outcome <- interest_outcome + summary_table=summary_table[-which(if.interest == "FALSE"), ] + if (sum(if.interest) == 0){ warnings("No colocalization with interest outcomes.") } + } else { + warnings("Interest outcome is not in the analysis outcomes, please check.") + } } - - # - cb_model_para - cb_model_para$N <- as.numeric(unlist(cb_model_para$N)) - cb_model_para$variables <- variables - - ll <- list("ucos_details" = specific_results, - "cb_model" = cb_model, - "cb_model_para" = cb_model_para) } else { - # - cb_model_para - cb_model_para$N <- as.numeric(unlist(cb_model_para$N)) - cb_model_para$variables <- variables - ll <- list("ucos_detials" = NULL, - "cb_model" = cb_model, - "cb_model_para" = cb_model_para) + summary_table <- NULL } - - return(ll) + return(summary_table) } -#' @noRd -#' @keywords cb_post_inference -get_summary_table_fm <- function(cb_output, outcome_names = NULL, gene_name = NULL){ - - specific_cs <- cb_output$ucos_details - if (length(specific_cs$ucos$ucos_index) != 0){ - - cs_outcome <- cb_output$data_info$outcome_info$outcome_names - if (!is.null(outcome_names)){ cs_outcome <- outcome_names } - pip <- as.numeric(cb_output$pip) - - summary_table <- matrix(NA, nrow = length(specific_cs$ucos$ucos_index), ncol = 9) - colnames(summary_table) <- c("outcomes", "ucos_id", "purity", - "top_variable", "top_variable_pip", "n_variables", "ucos_index", - "ucos_variables", "ucos_variables_pip") - summary_table <- as.data.frame(summary_table) - summary_table[,1] <- cs_outcome[unlist(specific_cs$ucos_outcomes$outcome_index)] - summary_table[,2] <- names(specific_cs$ucos$ucos_index) - summary_table[,3] <- as.numeric(diag(as.matrix(specific_cs$ucos_purity$min_abs_cor))) - summary_table[,4] <- unlist(sapply(specific_cs$ucos$ucos_variables, function(tmp) tmp[1])) 
- summary_table[,5] <- sapply(specific_cs$ucos$ucos_index, function(tmp) max(pip[tmp])) - summary_table[,6] <- as.numeric(sapply(specific_cs$ucos$ucos_index, length)) - summary_table[,7] <- unlist(sapply(specific_cs$ucos$ucos_index, function(tmp) paste0(tmp, collapse = "; "))) - summary_table[,8] <- unlist(sapply(specific_cs$ucos$ucos_variables, function(tmp) paste0(tmp, collapse = "; "))) - summary_table[,9] <- unlist(sapply(specific_cs$ucos$ucos_index, function(tmp) paste0(pip[tmp], collapse = "; "))) - if (!is.null(gene_name)){ summary_table$gene_name <- gene_name } - - } else { - summary_table <- NULL - } - return(summary_table) - -} - -#' @importFrom stats pchisq -cos_pvalue_filter <- function(cos_results, data_info = NULL, pvalue_cutoff = 1e-4){ +#' @rdname get_strong_colocalization +#' +#' @title Get colocalization summary table from a ColocBoost output. +#' +#' @description `get_strong_colocalization` gets the colocalization by discarding the weaker colocalization events or colocalized outcomes +#' +#' @param cb_output Output object from `colocboost` analysis +#' @param cos_npc_cutoff Minimum threshold of normalized probability of colocalization (NPC) for CoS. +#' @param npc_outcome_cutoff Minimum threshold of normalized probability of colocalized traits in each CoS. +#' @param pvalue_cutoff Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS. +#' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 +#' @param coverage A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95). 
+#' +#' @return A \code{"colocboost"} object with some or all of the following elements: +#' +#' \item{cos_summary}{A summary table for colocalization events.} +#' \item{vcp}{The variable colocalized probability for each variable.} +#' \item{cos_details}{An object with all information for colocalization results.} +#' \item{data_info}{An object with detailed information from input data} +#' \item{model_info}{An object with detailed information for colocboost model} +#' +#' @examples +#' get_strong_colocalization(cb_output, cos_npc_cutoff = 0.5, npc_outcome_cutoff = 0.25) +#' +#' @keywords cb_get_functions +#' @export +get_strong_colocalization <- function(cb_output, + cos_npc_cutoff = 0.5, + npc_outcome_cutoff = 0.25, + pvalue_cutoff = NULL, + weight_fudge_factor = 1.5, + coverage = 0.95){ - if (is.null(data_info)) - data_info <- get_data_info(cb_obj) + if (!inherits(cb_output, "colocboost")){ + stop("Input must from colocboost object!")} - n_cos <- length(cos_results$cos_results$cos$cos_index) - filtered_cos <- list("cos_index" = NULL, - "cos_variables" = NULL) - filtered_traits <- list("outcome_index" = NULL, - "outcome_name" = NULL) - filtered_cos_vcp <- list() - pp_remove <- c() - for (i in 1:n_cos){ - cos_tmp <- cos_results$cos_results$cos$cos_index[[i]] - cos_trait <- cos_results$cos_results$cos_outcomes$outcome_index[[i]] - minPV <- sapply(cos_trait, function(tmp){ - z <- data_info$z[[tmp]] - z <- z[cos_tmp] - pv <- pchisq(z^2, 1, lower.tail = FALSE) - min(pv) - }) - pp <- which(minPV < pvalue_cutoff) - if (length(pp) == 0 | length(pp) == 1) { - pp_remove <- c(pp_remove, i) - next - } - filtered_cos$cos_index <- c(filtered_cos$cos_index, - cos_results$cos_results$cos$cos_index[i]) - filtered_cos$cos_variables <- c(filtered_cos$cos_variables, - cos_results$cos_results$cos$cos_variables[i]) - filtered_cos_vcp <- c(filtered_cos_vcp, cos_results$cos_results$cos_vcp[i]) - if (length(pp) == length(cos_trait)){ - filtered_traits$outcome_index <- 
c(filtered_traits$outcome_index, - cos_results$cos_results$cos_outcomes$outcome_index[i]) - filtered_traits$outcome_name <- c(filtered_traits$outcome_name, - cos_results$cos_results$cos_outcomes$outcome_name[i]) - } else { - filtered_traits$outcome_index <- c(filtered_traits$outcome_index, - list(cos_results$cos_results$cos_outcomes$outcome_index[[i]][pp])) - filtered_traits$outcome_name <- c(filtered_traits$outcome_name, - list(cos_results$cos_results$cos_outcomes$outcome_name[[i]][pp])) - } + if (is.null(cb_output$cos_details)){ + warnings("No colocalization results in this region!") + return(cb_output) } - if (length(filtered_cos$cos_index)==0){ - cos_results <- list("cos_results" = NULL, "vcp" = NULL) + + if (npc_outcome_cutoff == 0 && cos_npc_cutoff == 0 && is.null(pvalue_cutoff)){ + message("All possible colocalization events are reported regardless of their relative evidence compared to uncolocalized events (cos_npc_cutoff = 0 and npc_outcome_cutoff = 0).") + return(cb_output) } else { - cos_results$cos_results$cos <- filtered_cos - names(filtered_traits$outcome_index) <- names(cos_results$cos_results$cos$cos_index) - names(filtered_traits$outcome_name) <- names(cos_results$cos_results$cos$cos_index) - cos_results$cos_results$cos_outcomes <- filtered_traits - cos_results$cos_results$cos_vcp <- filtered_cos_vcp - coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c() - for (i in 1:length(filtered_cos_vcp)){ - inw <- filtered_cos_vcp[[i]] - pp1 <- which(inw == max(inw)) - coloc_hits <- c(coloc_hits, pp1) - coloc_hits_variablenames <- c(coloc_hits_variablenames, data_info$variables[pp1]) - if (length(pp1)==1){ - coloc_hits_names <- c(coloc_hits_names, names(filtered_cos_vcp)[i]) - } else { - coloc_hits_names <- c(coloc_hits_names, paste0(names(filtered_cos_vcp)[i], ".", 1:length(pp1))) - } - } - coloc_hits <- data.frame("top_index" = coloc_hits, "top_variables" = coloc_hits_variablenames) - rownames(coloc_hits) <- coloc_hits_names - 
cos_results$cos_results$cos_top_variables <- coloc_hits - - # - change purity - if (length(pp_remove)!=0){ - cos_results$cos_results$cos_purity$min_abs_cor <- as.matrix(cos_results$cos_results$cos_purity$min_abs_cor)[-pp_remove, -pp_remove, drop=FALSE] - cos_results$cos_results$cos_purity$median_abs_cor <- as.matrix(cos_results$cos_results$cos_purity$median_abs_cor)[-pp_remove, -pp_remove, drop=FALSE] - cos_results$cos_results$cos_purity$max_abs_cor <- as.matrix(cos_results$cos_results$cos_purity$max_abs_cor)[-pp_remove, -pp_remove, drop=FALSE] - cos_results$cos_results$cos_PPC <- cos_results$cos_results$cos_PPC[-pp_remove] + if (is.null(pvalue_cutoff)){ + message(paste("Extracting colocalization results with cos_npc_cutoff =", cos_npc_cutoff, "and npc_outcome_cutoff =", npc_outcome_cutoff, ".\n", + "For each CoS, keep the outcomes configurations that the npc_outcome >", npc_outcome_cutoff, ".")) + } else { + if (pvalue_cutoff>1 | pvalue_cutoff<0){ + warnings("Please check the pvalue cutoff in [0,1].") + return(cb_output) + } + if (npc_outcome_cutoff == 0 && cos_npc_cutoff == 0){ + message(paste("Extracting colocalization results with pvalue_cutoff =", pvalue_cutoff, ".\n", + "For each CoS, keep the outcomes configurations that pvalue of variants for the outcome <", pvalue_cutoff, ".")) + } else { + message(paste("Extracting colocalization results with pvalue_cutoff =", pvalue_cutoff, ", cos_npc_cutoff =", cos_npc_cutoff, ", and npc_outcome_cutoff =", npc_outcome_cutoff, ".\n", + "For each CoS, keep the outcomes configurations that pvalue of variants for the outcome <", pvalue_cutoff, " and npc_outcome >", npc_outcome_cutoff, ".")) + } } - } - return(cos_results) - -} - - -colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5, coverage = 0.95){ - if (is.null(cb_results$cos_details)){ - warnings("No colocalization results in this region!") - return(cb_results) - } - if (npc_cutoff == 0){ - warnings("All possible colocalization events are 
reported regardless of their relative evidence compared to uncolocalized events (npc_cutoff = 0).") - return(cb_results) - } - message(paste("Extracting colocalization results with npc_cutoff =", npc_cutoff, ".\n", - "For each CoS, keep the outcomes configurations that the npc_outcome >", npc_cutoff, ".")) - - remove_cos <- function(cb_results, remove_idx = NULL){ + remove_cos <- function(cb_output, remove_idx = NULL){ if (length(remove_idx)==0){ - return(cb_results) + return(cb_output) } - ncos <- length(cos_details$cos_top_variables) - if (length(remove_idx) == ncos){ - cb_results$vcp <- NULL - cb_results$cos_details <- NULL - cb_results <- c(cb_results, list(vcp = NULL, cos_details = NULL)) - return(cb_results) + ncos <- length(cb_output$cos_details$cos$cos_index) + if (ncos == 0 | length(remove_idx) == ncos){ + cb_output$vcp <- NULL + cb_output$cos_details <- NULL + cb_output <- c(cb_output, list(vcp = NULL, cos_details = NULL)) + return(cb_output) } - cos_details <- cb_results$cos_details + cos_details <- cb_output$cos_details cos_details$cos_top_variables <- cos_details$cos_top_variables[-remove_idx, , drop = FALSE] cos_details$cos$cos_index <- cos_details$cos$cos_index[-remove_idx] cos_details$cos$cos_variables <- cos_details$cos$cos_variables[-remove_idx] @@ -709,40 +186,59 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 cos_details$cos_vcp <- cos_details$cos_vcp[-remove_idx] cos_details$cos_weights <- cos_details$cos_weights[-remove_idx] cos_details$cos_npc <- cos_details$cos_npc[-remove_idx] + cos_details$cos_min_npc_outcome <- cos_details$cos_min_npc_outcome[-remove_idx] cos_details$cos_purity$min_abs_cor <- as.matrix(cos_details$cos_purity$min_abs_cor)[-remove_idx, -remove_idx, drop=FALSE] cos_details$cos_purity$median_abs_cor <- as.matrix(cos_details$cos_purity$median_abs_cor)[-remove_idx, -remove_idx, drop=FALSE] cos_details$cos_purity$max_abs_cor <- as.matrix(cos_details$cos_purity$max_abs_cor)[-remove_idx, 
-remove_idx, drop=FALSE] vcp <- as.vector(1 - apply(1 - do.call(cbind, cos_details$cos_vcp),1,prod)) - names(vcp) <- cb_results$data_info$variables - cb_results$vcp <- vcp - cb_results$cos_details <- cos_details - return(cb_results) + names(vcp) <- cb_output$data_info$variables + cb_output$vcp <- vcp + cb_output$cos_details <- cos_details + return(cb_output) } - cos_details <- cb_results$cos_details - + cos_details <- cb_output$cos_details coloc_outcome_index <- coloc_outcome <- list() - colocset_names <- c() + colocset_names <- cos_min_npc_outcome <- c() for (i in 1:length(cos_details$cos$cos_index)){ - cos_npc_config <- cos_details$cos_normalization_evidence[[i]] + cos_npc_config <- cos_details$cos_outcomes_npc[[i]] npc_outcome <- cos_npc_config$npc_outcome - pos_pass <- which(npc_outcome>=npc_cutoff) - if (length(pos_pass) == 0){ - coloc_outcome_index[[i]] <- 0 - coloc_outcome[[i]] <- 0 - colocset_names[i] <- paste0("remove", i) - } else { - coloc_outcome_index[[i]] <- sort(cos_npc_config$outcomes_index[pos_pass]) - coloc_outcome[[i]] <- rownames(cos_npc_config)[pos_pass] - colocset_names[i] <- paste0("cos", i, ":", paste0(paste0("y", coloc_outcome_index[[i]]), collapse = "_")) - if (grepl("merged", names(cos_details$cos$cos_index)[i])) { - colocset_names[i] <- paste0(colocset_names[i], ":merged") + pos_pass <- which(npc_outcome>=npc_outcome_cutoff) + if (!is.null(pvalue_cutoff)){ + cos_tmp <- cos_details$cos$cos_index[[i]] + cos_trait <- cos_details$cos_outcomes$outcome_index[[i]] + minPV <- sapply(cos_trait, function(tmp){ + z <- cb_output$data_info$z[[tmp]][cos_tmp] + pv <- pchisq(z^2, 1, lower.tail = FALSE) + min(pv) + }) + pos_pass_pvalue <- which(minPV <= pvalue_cutoff) + if (length(pos_pass_pvalue)==0){ + pos_pass = NULL + } else { + pos_pass_pvalue <- match(cos_trait[pos_pass_pvalue], cos_npc_config$outcomes_index ) + pos_pass <- intersect( pos_pass_pvalue, pos_pass) } } + if (length(pos_pass) == 0 ){ + coloc_outcome_index[[i]] <- 0 + coloc_outcome[[i]] 
<- 0 + cos_min_npc_outcome[i] <- 0 + colocset_names[i] <- paste0("remove", i) + } else { + cos_min_npc_outcome[i] <- min(npc_outcome[pos_pass]) + coloc_outcome_index[[i]] <- sort(cos_npc_config$outcomes_index[pos_pass]) + coloc_outcome[[i]] <- rownames(cos_npc_config)[pos_pass] + colocset_names[i] <- paste0("cos", i, ":", paste0(paste0("y", coloc_outcome_index[[i]]), collapse = "_")) + if (grepl("merged", names(cos_details$cos$cos_index)[i])) { + colocset_names[i] <- paste0(colocset_names[i], ":merged") + } + } } - names(coloc_outcome) <- names(coloc_outcome_index) <- colocset_names + names(coloc_outcome) <- names(coloc_outcome_index) <- names(cos_min_npc_outcome) <- colocset_names cos_details$cos_outcomes <- list("outcome_index" = coloc_outcome_index, "outcome_name" = coloc_outcome) + cos_details$cos_min_npc_outcome <- cos_min_npc_outcome # - VCP cos_weights <- lapply(1:length(cos_details$cos_outcomes$outcome_index), function(idx){ @@ -759,16 +255,16 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 w[, pos, drop=FALSE] }) cos_details$cos_weights <- cos_weights - int_weight <- lapply(cos_weights, get_integrated_weight, alpha = alpha) + int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = weight_fudge_factor) names(int_weight) <- names(cos_weights) <- colocset_names vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight),1,prod)) - names(vcp) <- cb_results$data_info$variables - cb_results$vcp <- vcp + names(vcp) <- cb_output$data_info$variables + cb_output$vcp <- vcp cos_details$cos_vcp = int_weight # - resummary results - cos_re_idx <- lapply(int_weight, function(w){ unlist(get_in_csets(w, coverage = coverage))}) - cos_re_var <- lapply(cos_re_idx, function(idx){ cb_results$data_info$variables[idx] }) + cos_re_idx <- lapply(int_weight, function(w){ unlist(get_in_cos(w, coverage = coverage))}) + cos_re_var <- lapply(cos_re_idx, function(idx){ cb_output$data_info$variables[idx] }) coloc_csets <- 
list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var) cos_details$cos <- coloc_csets @@ -778,11 +274,12 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 inw <- int_weight[[i]] if (length(unique(inw))==1){ coloc_hits <- c(coloc_hits, 0) + coloc_hits_variablenames <- c(coloc_hits_variablenames, 0) coloc_hits_names <- c(coloc_hits_names, names(int_weight)[i]) } else { pp <- which(inw == max(inw)) coloc_hits <- c(coloc_hits, pp) - coloc_hits_variablenames <- c(coloc_hits_variablenames, cb_results$data_info$variables[pp]) + coloc_hits_variablenames <- c(coloc_hits_variablenames, cb_output$data_info$variables[pp]) if (length(pp)==1){ coloc_hits_names <- c(coloc_hits_names, names(int_weight)[i]) } else { @@ -793,13 +290,17 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 coloc_hits <- data.frame("top_index" = coloc_hits, "top_variables" = coloc_hits_variablenames) rownames(coloc_hits) <- coloc_hits_names cos_details$cos_top_variables <- coloc_hits - cb_results$cos_details <- cos_details - + cb_output$cos_details <- cos_details - # remove CoS does not include outcomes pass npc_cutoff + # remove CoS does not include outcomes pass npc_outcome_cutoff remove <- grep("remove", colocset_names) - cb_results <- remove_cos(cb_results, remove_idx = remove) - cos_details <- cb_results$cos_details + cb_output <- remove_cos(cb_output, remove_idx = remove) + cos_details <- cb_output$cos_details + + # remove CoS does not pass cos_npc_cutoff + remove <- which(cos_details$cos_npc < cos_npc_cutoff) + cb_output <- remove_cos(cb_output, remove_idx = remove) + cos_details <- cb_output$cos_details # remove CoS only with one trait n_outcome <- sapply(cos_details$cos_outcomes$outcome_index, length) @@ -807,8 +308,8 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 if (length(single)==length(n_outcome)){ # - all remaining the single outcome - cb_results$cos_details <- cb_results$vcp <- 
NULL - cb_results <- c(cb_results, list(vcp = NULL, cos_details = NULL)) + cb_output$cos_details <- cb_output$vcp <- NULL + cb_output <- c(cb_output, list(vcp = NULL, cos_details = NULL)) } else if (length(single)!=0 & length(single)!=length(n_outcome)){ @@ -822,33 +323,578 @@ colocboost_get_npc_configs <- function(cb_results, npc_cutoff = 0.7, alpha = 1.5 "median_abs_cor" = as.matrix(cos_details$cos_purity$median_abs_cor)[single, single, drop=FALSE], "max_abs_cor" = as.matrix(cos_details$cos_purity$max_abs_cor)[single, single, drop=FALSE]), "ucos_top_variables" = cos_details$cos_top_variables[single, , drop = FALSE]) - cb_results$ucos_from_cos <- ucos_from_cos - cb_results <- remove_cos(cb_results, remove_idx = single) + cb_output$ucos_from_cos <- ucos_from_cos + cb_output <- remove_cos(cb_output, remove_idx = single) } # - refine and output - target_outcome <- NULL - target_idx <- which(cb_results$data_info$outcome_info$is_target) - if (length(target_idx)!=0){ - target_outcome <- cb_results$data_info$outcome_info$outcome_names[target_idx] - } - cb_results$cos_summary <- get_cos_summary(cb_results, target_outcome = target_outcome) + class(cb_output) <- "colocboost" + cb_output$cos_summary <- get_cos_summary(cb_output) - return(cb_results) + return(cb_output) } +#' @rdname get_ucos_summary +#' +#' @title Get fine-mapping summary table from a ColocBoost output with only one single trait fine-mapping analysis. +#' +#' @description `get_ucos_summary` get the fine-mapping summary table +#' +#' @param cb_output Output object from `colocboost` analysis +#' @param outcome_names Optional vector of names of outcomes, which has the same order as Y in the original analysis. +#' @param gene_name Optional character string. When provided, adds a column with this gene name to the output table for easier filtering in downstream analyses. 
+#' +#' @return A summary table for fine-mapped events with the following columns: +#' \item{outcomes}{Outcomes analyzed } +#' \item{ucos_id}{Unique identifier for fine-mapped confidence sets } +#' \item{purity}{Minimum absolute correlation of variables with in fine-mapped confidence sets } +#' \item{top_variable}{The variable with highest posterior inclusion probability (PIP) } +#' \item{top_variable_pip}{Posterior inclusion probability (PIP) for the top variable} +#' \item{n_variables}{Number of variables in colocalization confidence set (CoS)} +#' \item{ucos_index}{Indices of fine-mapped variables} +#' \item{ucos_variables}{List of fine-mapped variables} +#' \item{ucos_variables_pip}{Posterior inclusion probability (PIP) for all fine-mapped variables} +#' +#' @examples +#' get_ucos_summary(cb_output) +#' +#' @keywords cb_get_functions +#' @noRd +get_ucos_summary <- function(cb_output, outcome_names = NULL, gene_name = NULL){ + + if (!inherits(cb_output, "colocboost")){ + stop("Input must from colocboost object!")} + + specific_cs <- cb_output$ucos_details + if (length(specific_cs$ucos$ucos_index) != 0){ + + cs_outcome <- cb_output$data_info$outcome_info$outcome_names + if (!is.null(outcome_names)){ cs_outcome <- outcome_names } + pip <- as.numeric(cb_output$pip) + + summary_table <- matrix(NA, nrow = length(specific_cs$ucos$ucos_index), ncol = 9) + colnames(summary_table) <- c("outcomes", "ucos_id", "purity", + "top_variable", "top_variable_pip", "n_variables", "ucos_index", + "ucos_variables", "ucos_variables_pip") + summary_table <- as.data.frame(summary_table) + summary_table[,1] <- cs_outcome[unlist(specific_cs$ucos_outcomes$outcome_index)] + summary_table[,2] <- names(specific_cs$ucos$ucos_index) + summary_table[,3] <- as.numeric(diag(as.matrix(specific_cs$ucos_purity$min_abs_cor))) + summary_table[,4] <- unlist(sapply(specific_cs$ucos$ucos_variables, function(tmp) tmp[1])) + summary_table[,5] <- sapply(specific_cs$ucos$ucos_index, function(tmp) 
max(pip[tmp])) + summary_table[,6] <- as.numeric(sapply(specific_cs$ucos$ucos_index, length)) + summary_table[,7] <- unlist(sapply(specific_cs$ucos$ucos_index, function(tmp) paste0(tmp, collapse = "; "))) + summary_table[,8] <- unlist(sapply(specific_cs$ucos$ucos_variables, function(tmp) paste0(tmp, collapse = "; "))) + summary_table[,9] <- unlist(sapply(specific_cs$ucos$ucos_index, function(tmp) paste0(pip[tmp], collapse = "; "))) + if (!is.null(gene_name)){ summary_table$gene_name <- gene_name } + + } else { + summary_table <- NULL + } + return(summary_table) + +} +#' Extract CoS simply change the coverage without checking purity +#' @keywords cb_get_functions +#' @noRd +get_cos_different_coverage <- function(cb_output, coverage = 0.95){ + + cos_vcp <- cb_output$cos_details$cos_vcp + cos_diff_coverage <- lapply(cos_vcp, function(w){ + temp <- order(w, decreasing=T) + temp[1:min(which(cumsum(w[temp]) > coverage))] # 95% + }) + return(cos_diff_coverage) +} +#' @title Set of internal functions to re-organize ColocBoost output format +#' +#' @description +#' The `colocboost_output_reorganization` functions access basic properties inferences from a fitted ColocBoost model. This documentation serves as a summary for all related post-inference functions. +#' +#' @details +#' The following functions are included in this set: +#' `get_data_info` get formatted \code{data_info} in ColocBoost output +#' `get_cos_details` get formatted \code{cos_details} in ColocBoost output +#' `get_model_info` get formatted \code{model_info} in ColocBoost output +#' `get_full_output` get formatted additional information in ColocBoost output when \code{output_level!=1} +#' +#' These functions are not exported individually and are accessed via `colocboost_output_reorganization`. +#' +#' @rdname colocboost_output_reorganization +#' @keywords cb_reorganization +#' @noRd +colocboost_output_reorganization <- function() { + message("This function re-formats colocboost output as internal used. 
See details for more information.") +} +#' Get formatted \code{data_info} in ColocBoost output +#' @noRd +#' @keywords cb_reorganization +get_data_info <- function(cb_obj){ + + ## - analysis data information + n_outcome <- cb_obj$cb_model_para$L + n_variables <- cb_obj$cb_model_para$P + analysis_outcome <- cb_obj$cb_model_para$outcome_names + variables <- cb_obj$cb_data$variable.names + target_outcome <- NULL + is_target <- rep(FALSE, n_outcome) + if (!is.null(cb_obj$cb_model_para$target_outcome_idx)){ + target_outcome <- analysis_outcome[cb_obj$cb_model_para$target_outcome_idx] + is_target[cb_obj$cb_model_para$target_outcome_idx] <- TRUE + } + is_sumstat <- grepl("sumstat_outcome", names(cb_obj$cb_data$data)) + outcome_info <- data.frame( + "outcome_names" = analysis_outcome, "sample_size" = cb_obj$cb_model_para$N, + "is_sumstats" = is_sumstat, "is_target" = is_target ) + rownames(outcome_info) <- paste0("y", 1:n_outcome) + + ## - marginal associations + z_scores <- lapply(cb_obj$cb_model, function(cb){ as.numeric(cb$z_univariate) }) + betas <- lapply(cb_obj$cb_model, function(cb){ as.numeric(cb$beta) }) + names(z_scores) <- names(betas) <- analysis_outcome + ## - output data info + data.info <- list("n_outcomes" = n_outcome, + "n_variables" = n_variables, + "outcome_info" = outcome_info, + "variables" = variables, + "coef" = betas, + "z" = z_scores) + return(data.info) + +} +#' Get formatted \code{cos_details} in ColocBoost output +#' @noRd +#' @keywords cb_reorganization +get_cos_details <- function(cb_obj, coloc_out, data_info = NULL){ + + if (is.null(data_info)) + data_info <- get_data_info(cb_obj) + + + ### ----- Define the colocalization results + coloc_sets <- coloc_out$cos + if (length(coloc_sets)!=0){ + + # - colocalization outcome configurations + tmp <- get_cos_evidence(cb_obj, coloc_out, data_info) + normalization_evidence <- tmp$normalization_evidence + npc <- tmp$npc + cos_min_npc_outcome <- sapply( normalization_evidence, function(cp) 
min(cp$npc_outcome)) + + # - colocalized outcomes + analysis_outcome <- cb_obj$cb_model_para$outcome_names + coloc_outcome_index <- coloc_outcome <- list() + colocset_names <- c() + for (i in 1:length(coloc_out$cos)){ + coloc_outcome_index[[i]] <- coloc_out$coloc_outcomes[[i]] + coloc_outcome[[i]] <- analysis_outcome[coloc_outcome_index[[i]]] + colocset_names[i] <- paste0("cos", i, ":", paste0(paste0("y", coloc_outcome_index[[i]]), collapse = "_")) + if (grepl("merged", names(coloc_sets)[i])) { + colocset_names[i] <- paste0(colocset_names[i], ":merged") + } + } + names(coloc_outcome) <- names(coloc_outcome_index) <- colocset_names + names(npc) <- names(normalization_evidence) <- names(cos_min_npc_outcome) <- colocset_names + coloc_outcomes <- list("outcome_index" = coloc_outcome_index, "outcome_name" = coloc_outcome) + + # - colocalized sets for variables + coloc_csets_variableidx <- coloc_out$cos + coloc_csets_variablenames <- lapply(coloc_csets_variableidx, function(coloc_tmp){ cb_obj$cb_model_para$variables[coloc_tmp] }) + coloc_csets_variableidx <- lapply(coloc_csets_variablenames, function(variable) match(variable, data_info$variables)) + names(coloc_csets_variableidx) <- names(coloc_csets_variablenames) <- colocset_names + coloc_csets_original <- list("cos_index" = coloc_csets_variableidx, "cos_variables" = coloc_csets_variablenames) + + # - colocalized set cs_change + cs_change <- coloc_out$cs_change + rownames(cs_change) <- colocset_names + colnames(cs_change) <- analysis_outcome + + # - VCP + cos_weights <- lapply(coloc_out$avWeight, function(w){ + pos <- match(data_info$variables, cb_obj$cb_model_para$variables) + return(w[pos, , drop = FALSE]) + }) + int_weight <- lapply(cos_weights, get_integrated_weight, weight_fudge_factor = cb_obj$cb_model_para$weight_fudge_factor) + names(int_weight) <- names(cos_weights) <- colocset_names + vcp <- as.vector(1 - apply(1 - do.call(cbind, int_weight),1,prod)) + names(vcp) <- data_info$variables + + # - resummary 
results + cos_re_idx <- lapply(int_weight, function(w){ unlist(get_in_cos(w, coverage = cb_obj$cb_model_para$coverage))}) + cos_re_var <- lapply(cos_re_idx, function(idx){ data_info$variables[idx] }) + coloc_csets <- list("cos_index" = cos_re_idx, "cos_variables" = cos_re_var) + + # - hits variables in each csets + coloc_hits <- coloc_hits_variablenames <- coloc_hits_names <- c() + for (i in 1:length(int_weight)){ + inw <- int_weight[[i]] + pp <- which(inw == max(inw)) + coloc_hits <- c(coloc_hits, pp) + coloc_hits_variablenames <- c(coloc_hits_variablenames, data_info$variables[pp]) + if (length(pp)==1){ + coloc_hits_names <- c(coloc_hits_names, names(int_weight)[i]) + } else { + coloc_hits_names <- c(coloc_hits_names, paste0(names(int_weight)[i], ".", 1:length(pp))) + } + } + coloc_hits <- data.frame("top_index" = coloc_hits, "top_variables" = coloc_hits_variablenames) + rownames(coloc_hits) <- coloc_hits_names + + # - purity + ncos <- length(coloc_csets$cos_index) + if (ncos >= 2){ + empty_matrix <- matrix(NA, ncos, ncos) + colnames(empty_matrix) <- rownames(empty_matrix) <- colocset_names + csets_purity <- lapply(1:3, function(ii){ + diag(empty_matrix) <- coloc_out$purity[,ii] + return(empty_matrix) + }) + for (i in 1:(ncos-1)){ + for (j in (i+1):ncos){ + cset1 <- coloc_csets$cos_index[[i]] + cset2 <- coloc_csets$cos_index[[j]] + y.i <- coloc_outcomes$outcome_index[[i]] + y.j <- coloc_outcomes$outcome_index[[j]] + yy <- unique(y.i, y.j) + res <- list() + flag <- 1 + for (ii in yy){ + X_dict <- cb_obj$cb_data$dict[ii] + res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[ii]]$N, + miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, + P = cb_obj$cb_model_para$P) + flag <- flag + 1 + } + res <- Reduce(pmax, res) + csets_purity <- lapply(1:3, function(ii){ + csets_purity[[ii]][i,j] <- csets_purity[[ii]][j,i] <- res[ii] + return(csets_purity[[ii]]) + }) + } + } + 
names(csets_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") + + } else { + csets_purity <- lapply(1:length(coloc_out$purity), function(i){ + tmp <- as.matrix(coloc_out$purity[i]) + rownames(tmp) <- colnames(tmp) <- colocset_names + tmp + }) + names(csets_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") + } + + # - save coloc_results + coloc_results <- list("cos" = coloc_csets, + "cos_outcomes" = coloc_outcomes, + "cos_vcp" = int_weight, + "cos_outcomes_npc" = normalization_evidence, + "cos_npc" = npc, + "cos_min_npc_outcome" = cos_min_npc_outcome, + "cos_purity" = csets_purity, + "cos_top_variables" = coloc_hits, + "cos_weights" = cos_weights) + + + # - missing variable and warning message + missing_variables_idx <- Reduce(union, lapply(cb_obj$cb_data$data,function(cb) cb$variable_miss )) + missing_variables <- cb_obj$cb_model_para$variables[missing_variables_idx] + cos_missing_variables_idx <- lapply(coloc_csets_original$cos_variables, function(variable){ + missing <- intersect(variable, missing_variables) + if (length(missing)!=0){ + match(missing, data_info$variables) + } else { NULL } + }) + cos_missing_variables <- lapply(cos_missing_variables_idx, function(variable){if(!is.null(variable)){data_info$variables[variable]} else {NULL}}) + warning_needed <- any(!sapply(cos_missing_variables, is.null)) + if (warning_needed){ + is_missing <- which(!sapply(cos_missing_variables, is.null)) + cos_missing_variables_idx <- cos_missing_variables_idx[is_missing] + cos_missing_variables <- cos_missing_variables[is_missing] + cos_missing_vcp <- lapply(cos_missing_variables_idx, function(idx){ vcp[idx] }) + warning_message <- paste("CoS", paste(names(cos_missing_variables_idx), collapse = ","), + "contains missing variables in at least one outcome.", + "The missing variables will cause the ~0 VCP scores.") + cos_warnings <- list("cos_missing_info" = list("index" = cos_missing_variables_idx, + "variables" = cos_missing_variables, + "vcp" = 
cos_missing_vcp), + "warning_message" = warning_message) + coloc_results$cos_warnings <- cos_warnings + } + + } else { + coloc_results <- NULL + vcp <- NULL + } + return(list("cos_results"=coloc_results, "vcp"=vcp)) + +} +#' Get formatted \code{model_info} in ColocBoost output +#' @noRd +#' @keywords cb_reorganization +get_model_info <- function(cb_obj, outcome_names = NULL){ + + if (is.null(outcome_names)){ + data_info <- get_data_info(cb_obj) + outcome_names <- data_info$outcome_info$outcome_names + } + + profile_loglik <- cb_obj$cb_model_para$profile_loglike + n_updates <- cb_obj$cb_model_para$num_updates + model_coveraged <- cb_obj$cb_model_para$coveraged + jk_update <- cb_obj$cb_model_para$real_update_jk + outcome_proximity_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_path) + outcome_coupled_obj <- lapply(cb_obj$cb_model, function(cb) cb$obj_single) + outcome_profile_loglik <- lapply(cb_obj$cb_model, function(cb) cb$profile_loglike_each) + names(outcome_proximity_obj) <- names(outcome_coupled_obj) <- + names(outcome_profile_loglik) <- outcome_names + ll <- list("model_coveraged" = model_coveraged, + "n_updates" = n_updates, + "profile_loglik" = profile_loglik, + "outcome_profile_loglik" = outcome_profile_loglik, + "outcome_proximity_obj" = outcome_proximity_obj, + "outcome_coupled_obj" = outcome_coupled_obj, + "jk_update" = jk_update) + return(ll) + +} - - +#' Get formatted additional information in ColocBoost output when \code{output_level!=1} +#' @noRd +#' @keywords cb_reorganization +get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output = NULL, weaker_ucos = FALSE){ + + cb_model <- cb_obj$cb_model + cb_model_para <- cb_obj$cb_model_para + + ## - obtain the order of variables based on the variables names if it has position information + if (is.null(variables)){ + variables <- cb_obj$cb_model_para$variables + if (all(grepl("chr", variables))){ + # - if variables_name has position informtaion, re-order all_info based on 
positions + position <- as.numeric(gsub(".*:(\\d+).*", "\\1", variables)) + # position <- as.numeric(sapply(variables_name, function(tmp) strsplit(tmp, ":")[[1]][2])) + ordered <- order(position, decreasing = FALSE) + } else { + ordered <- 1:length(variables) + } + } else { + ordered <- match(variables, cb_obj$cb_model_para$variables) + } + + ## - reorder all output + # - cb_model + tmp <- lapply(cb_model, function(cb){ + cb$beta <- cb$beta[ordered] + cb$weights_path <- cb$weights_path[, ordered] + cb$change_loglike <- cb$change_loglike[ordered] + cb$correlation <- as.numeric(cb$correlation[ordered]) + cb$z <- as.numeric(cb$z[ordered]) + cb$ld_jk <- cb$ld_jk[,ordered] + cb$z_univariate <- as.numeric(cb$z_univariate[ordered]) + cb$beta_hat <- as.numeric(cb$beta_hat[ordered]) + cb$multi_correction <- as.numeric(cb$multi_correction[ordered]) + cb$multi_correction_univariate <- as.numeric(cb$multi_correction_univariate[ordered]) + return(cb) + }) + cb_model <- tmp + + # - sets + if (!is.null(past_out)){ + out_ucos <- past_out$ucos + # - single sets + if (!is.null(out_ucos$ucos_each)){ + + out_ucos$ucos_each <- lapply(out_ucos$ucos_each, function(cs){ + match(cb_model_para$variables[cs], variables) + }) + out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[ordered,,drop=FALSE] + + # - re-orginize specific results + analysis_outcome <- cb_obj$cb_model_para$outcome_names + specific_outcome_index <- specific_outcome <- list() + specific_cs_names <- c() + for (i in 1:length(out_ucos$ucos_each)){ + cc <- out_ucos$avW_ucos_each[,i,drop=FALSE] + tmp_names <- colnames(cc) + specific_outcome_index[[i]] <- as.numeric(gsub(".*Y([0-9]+).*", "\\1", tmp_names)) + specific_outcome[[i]] <- analysis_outcome[specific_outcome_index[[i]]] + specific_cs_names[i] <- paste0("ucos", i, ":y", specific_outcome_index[[i]]) + } + names(specific_outcome) <- names(specific_outcome_index) <- specific_cs_names + specific_outcomes <- list("outcome_index" = specific_outcome_index, "outcome_name" = 
specific_outcome) + + # - specific sets for variables + specific_cs_variableidx <- out_ucos$ucos_each + specific_cs_variablenames <- lapply(specific_cs_variableidx, function(specific_tmp){ cb_obj$cb_model_para$variables[specific_tmp] }) + specific_cs_variableidx <- lapply(specific_cs_variablenames, function(variable) match(variable, variables)) + names(specific_cs_variableidx) <- names(specific_cs_variablenames) <- specific_cs_names + specific_css <- list("ucos_index" = specific_cs_variableidx, "ucos_variables" = specific_cs_variablenames) + + # - specific set cs_change + cs_change <- out_ucos$change_obj_each + rownames(cs_change) <- specific_cs_names + colnames(cs_change) <- analysis_outcome + index_change <- as.data.frame(which(cs_change != 0, arr.ind = TRUE)) + change_outcomes <- analysis_outcome[index_change$col] + change_values <- diag(as.matrix(cs_change[index_change$row, index_change$col])) + cs_change <- data.frame("ucos_outcome" = change_outcomes, "ucos_delta" = change_values) + + # - filter weak ucos + check_null_max <- sapply(cb_model, function(cb) cb$check_null_max) + remove_weak <- sapply(1:nrow(cs_change), function(ic){ + outcome_tmp <- cs_change$ucos_outcome[ic] + delta_tmp <- cs_change$ucos_delta[ic] + pp <- which(cb_obj$cb_model_para$outcome_names == outcome_tmp) + check_tmp <- check_null_max[pp] + delta_tmp >= check_tmp + }) + keep_ucos <- which(remove_weak) + if (length(keep_ucos)==0){ + specific_results <- NULL + } else { + specific_outcomes$outcome_index <- specific_outcomes$outcome_index[keep_ucos] + specific_outcomes$outcome_name <- specific_outcomes$outcome_name[keep_ucos] + specific_css$ucos_index <- specific_css$ucos_index[keep_ucos] + specific_css$ucos_variables <- specific_css$ucos_variables[keep_ucos] + cs_change <- cs_change[keep_ucos, , drop = FALSE] + out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[, keep_ucos, drop = FALSE] + specific_cs_names <- specific_cs_names[keep_ucos] + out_ucos$purity_each <- out_ucos$purity_each[keep_ucos, 
, drop = FALSE] + + # - ucos_weight + specific_w <- lapply(1:ncol(out_ucos$avW_ucos_each), function(ii) out_ucos$avW_ucos_each[,ii,drop=FALSE]) + names(specific_w) <- specific_cs_names + + # - hits variables in each csets + cs_hits <- sapply(1:length(specific_w), function(jj){ inw=specific_w[[jj]]; sample(which(inw == max(inw)),1) }) + cs_hits_variablenames <- sapply(cs_hits, function(ch) variables[ch] ) + specific_cs_hits <- data.frame("top_index" = cs_hits, "top_variables" = cs_hits_variablenames) # save + rownames(specific_cs_hits) <- specific_cs_names + + # - purity + nucos <- length(specific_css$ucos_index) + if (nucos >= 2){ + empty_matrix <- matrix(NA, nucos, nucos) + colnames(empty_matrix) <- rownames(empty_matrix) <- specific_cs_names + specific_cs_purity <- lapply(1:3, function(ii){ + diag(empty_matrix) <- out_ucos$purity_each[,ii] + return(empty_matrix) + }) + for (i in 1:(nucos-1)){ + for (j in (i+1):nucos){ + cset1 <- specific_css$ucos_index[[i]] + cset2 <- specific_css$ucos_index[[j]] + y.i <- specific_outcomes$outcome_index[[i]] + y.j <- specific_outcomes$outcome_index[[j]] + yy <- unique(y.i, y.j) + res <- list() + flag <- 1 + for (ii in yy){ + X_dict <- cb_obj$cb_data$dict[ii] + res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[ii]]$N, + miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, + P = cb_obj$cb_model_para$P) + flag <- flag + 1 + } + res <- Reduce(pmax, res) + specific_cs_purity <- lapply(1:3, function(ii){ + specific_cs_purity[[ii]][i,j] <- specific_cs_purity[[ii]][j,i] <- res[ii] + return(specific_cs_purity[[ii]]) + }) + } + } + names(specific_cs_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") + + } else { + specific_cs_purity <- out_ucos$purity_each + rownames(specific_cs_purity) <- specific_cs_names + } + + # - cos&ucos purity + cos <- cb_output$cos_details$cos$cos_index + ncos <- length(cos) + if (ncos!=0){ + 
empty_matrix <- matrix(NA, ncos, nucos) + colnames(empty_matrix) <- specific_cs_names + rownames(empty_matrix) <- names(cos) + cos_ucos_purity <- lapply(1:3, function(ii) empty_matrix ) + for (i in 1:ncos){ + for (j in 1:nucos){ + cset1 <- cos[[i]] + cset2 <- specific_css$ucos_index[[j]] + y.i <- cb_output$cos_details$cos_outcomes$outcome_index[[i]] + y.j <- specific_outcomes$outcome_index[[j]] + yy <- unique(y.i, y.j) + res <- list() + flag <- 1 + for (ii in yy){ + X_dict <- cb_obj$cb_data$dict[ii] + res[[flag]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[ii]]$N, + miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, + P = cb_obj$cb_model_para$P) + flag <- flag + 1 + } + res <- Reduce(pmax, res) + cos_ucos_purity <- lapply(1:3, function(ii){ + cos_ucos_purity[[ii]][i,j] <- res[ii] + return(cos_ucos_purity[[ii]]) + }) + } + } + names(cos_ucos_purity) <- c("min_abs_cor", "max_abs_cor", "median_abs_cor") + } else { + cos_ucos_purity <- NULL + } + + + # - save coloc_results + specific_results <- list("ucos" = specific_css, + "ucos_outcomes" = specific_outcomes, + "ucos_weight" = specific_w, + "ucos_top_variables" = specific_cs_hits, + "ucos_purity" = specific_cs_purity, + "cos_ucos_purity" = cos_ucos_purity, + "ucos_outcomes_delta" = cs_change) + } + + } else { + specific_results <- NULL + } + + # - cb_model_para + cb_model_para$N <- as.numeric(unlist(cb_model_para$N)) + cb_model_para$variables <- variables + + ll <- list("ucos_details" = specific_results, + "cb_model" = cb_model, + "cb_model_para" = cb_model_para) + } else { + # - cb_model_para + cb_model_para$N <- as.numeric(unlist(cb_model_para$N)) + cb_model_para$variables <- variables + ll <- list("ucos_detials" = NULL, + "cb_model" = cb_model, + "cb_model_para" = cb_model_para) + } + + return(ll) + +} diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 740db9e..645d1b3 100644 --- a/R/colocboost_plot.R +++ 
b/R/colocboost_plot.R @@ -1,21 +1,65 @@ +#' @rdname colocboost_plot +#' +#' @title Plot visualization plot from a ColocBoost output. +#' +#' @description `colocboost_plot` generates visualization plots for colocalization events from a ColocBoost analysis. +#' +#' @param Output object from `colocboost` analysis +#' @param y Specifies the y-axis values, default is "log10p" for -log10 transformed marginal association p-values. +#' @param pos Optional plotting range of x-axis to zoom in to a specific region. +#' @param plot_target_only Logical, if TRUE only plots colocalization with target outcome, default is FALSE. +#' @param plot_cos_idx Optional indices of CoS to plot +#' @param outcome_idx Optional indices of outcomes to include in the plot. \code{outcome_idx=NULL} to plot only the outcomes having colocalization. +#' @param points_color Background color for non-colocalized variables, default is "grey80". +#' @param cos_color Optional custom colors for CoS. +#' @param add_vertical Logical, if TRUE adds vertical lines at specified positions, default is FALSE +#' @param add_vertical_idx Optional indices for vertical lines. +#' @param outcome_names Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}. +#' @param plot_cols Number of columns in the plot grid, default is 2. If you have many colocalization. please consider increasing this. +#' @param variant_coord Logical, if TRUE uses variant coordinates on x-axis, default is FALSE. This is required the variable names including position information. 
+#' @param show_hits Logical, if TRUE shows top variables for each CoS, default is FALSE +#' @param show_cos_to_uncoloc Logical, if TRUE shows colocalization to uncolocalized outcomes to diagnose, default is FALSE +#' @param show_cos_to_uncoloc_idx Optional indices for showing CoS to all uncolocalized outcomes +#' @param show_cos_to_uncoloc_outcome Optional outcomes for showing CoS to uncolocalized outcomes +#' @param gene_name Optional gene name to display in plot title +#' @param ylim_each Logical, if TRUE uses separate y-axis limits for each plot, default is TRUE +#' @param outcome_legend_pos Position for outcome legend, default is "top" +#' @param outcome_legend_size Size for outcome legend text, default is 1.2 +#' @param cos_legend_pos Position for colocalization set legend, default is "bottomleft" +#' @param show_variable Logical, if TRUE displays variant IDs, default is FALSE +#' @param lab_style Vector of two numbers for label style (size, boldness), default is c(2, 1) +#' @param axis_style Vector of two numbers for axis style (size, boldness), default is c(2, 1) +#' @param title_style Vector of two numbers for title style (size, boldness), default is c(2.5, 2) +#' @param ... Additional parameters passed to `plot` functions +#' +#' @return Visualization plot for each colcoalization event. 
+#' #' @importFrom utils head tail #' @importFrom graphics abline axis legend mtext par points text #' @importFrom grDevices adjustcolor +#' +#' @examples +#' colocboost_plot(cb_output) +#' +#' @keywords cb_plot +#' @export colocboost_plot <- function(cb_output, y = "log10p", - gene_name = NULL, + pos = NULL, + plot_target_only = FALSE, + plot_cos_idx = NULL, outcome_idx = NULL, + points_color = "grey80", + cos_color = NULL, + add_vertical = FALSE, + add_vertical_idx = NULL, outcome_names = NULL, plot_cols = 2, - pos = NULL, - plot_cos_idx = NULL, - plot_target_only = FALSE, variant_coord = FALSE, - show_coloc = TRUE, show_hits = FALSE, show_cos_to_uncoloc = FALSE, show_cos_to_uncoloc_idx = NULL, show_cos_to_uncoloc_outcome = NULL, - points_color = "grey90", cos_color = NULL, + gene_name = NULL, ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.2, @@ -24,8 +68,6 @@ colocboost_plot <- function(cb_output, y = "log10p", lab_style = c(2, 1), axis_style = c(2, 1), title_style = c(2.5, 2), - add_vertical = FALSE, add_vertical_idx = NULL, - add_genetrack = FALSE, ...){ @@ -110,7 +152,7 @@ colocboost_plot <- function(cb_output, y = "log10p", } for (iy in outcome_idx){ args$y = y[[iy]] - args$ylim = c(0, cb_plot_init$ymax[iy]) + args$ylim = c(cb_plot_init$ymin[iy], cb_plot_init$ymax[iy]) if (!is.null(cb_plot_init$xtext)){ args$xaxt = "n" do.call(plot, args) @@ -213,9 +255,9 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, # extract results from colocboost analysis_outcome <- cb_output$data_info$outcome_info$outcome_names - target_idx <- which(cb_output$data_info$outcome_info$is_target) - if ( length(target_idx)!=0 ){ - target_outcome <- analysis_outcome[target_idx] + target_outcome_idx <- which(cb_output$data_info$outcome_info$is_target) + if ( length(target_outcome_idx)!=0 ){ + target_outcome <- analysis_outcome[target_outcome_idx] } else { target_outcome <- NULL } # check if target cos target_cos <- 
cb_output$cos_summary$cos_id[cb_output$cos_summary$target_outcome!=FALSE] @@ -223,6 +265,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, # extract z-scores variables <- cb_output$data_info$variables Z <- cb_output$data_info$z + coef <- cb_output$data_info$coef # if finemapping if (cb_output$data_info$n_outcomes==1){ @@ -305,6 +348,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, "x" = x, "Zscores" = Z, "vcp" = vcp, + "coef" = coef, "cos" = coloc_cos, "cos_hits" = coloc_hits, "coloc_index" = coloc_index) @@ -375,7 +419,7 @@ get_input_plot <- function(cb_output, plot_cos_idx = NULL, #' @importFrom stats pnorm plot_initial <- function(cb_plot_input, y = "log10p", - points_color = "grey90", cos_color = NULL, + points_color = "grey80", cos_color = NULL, ylim_each = TRUE, gene_name = NULL, outcome_legend_size = 1.5, outcome_legend_pos = "right", @@ -424,6 +468,9 @@ plot_initial <- function(cb_plot_input, y = "log10p", ylab = "VCP" if (length(cb_plot_input$outcomes)==1){ ylab = "PIP" } args$ylim <- c(0,1) + } else if (y == "coef"){ + plot_data <- cb_plot_input$coef + ylab = "Coefficients" } else { stop("Invalid y value! 
Choose from 'z' or 'z_original'") } @@ -448,7 +495,13 @@ plot_initial <- function(cb_plot_input, y = "log10p", args$axis_face <- axis_style[2] # - set ylim for each subfigure - if (exists("ylim", args)) ymax = rep(args$ylim[2],length(args$y)) else ymax = NULL + if (exists("ylim", args)){ + ymax = rep(args$ylim[2],length(args$y)) + ymin = rep(args$ylim[1],length(args$y)) + } else { + ymax = NULL + ymin = rep(0,length(args$y)) + } if (ylim_each & is.null(ymax)) { ymax <- sapply(plot_data, function(p){ valid_values <- p[is.finite(p)] @@ -459,8 +512,12 @@ plot_initial <- function(cb_plot_input, y = "log10p", } return(ymax) }) + if (y == "coef"){ + ymin <- sapply(plot_data, function(p) min(p)*1.05) + } } args$ymax = ymax + args$ymin = ymin # - set legend text position and format args$outcome_legend_pos <- outcome_legend_pos diff --git a/R/colocboost_update.R b/R/colocboost_update.R index efda4ce..706cfe7 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -10,16 +10,16 @@ #' @export colocboost_update <- function(cb_model, cb_model_para, cb_data, tau = 0.01, - decayrate = 1, - func_prior = "z2z", + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = FALSE, - dynamic_step = TRUE){ + lambda_target_outcome = 1, + LD_free = FALSE, + dynamic_learning_rate = TRUE){ # - clear which outcome need to be updated at which jk pos.update <- which(cb_model_para$update_temp$update_status != 0) - target_idx <- cb_model_para$target_idx + target_outcome_idx <- cb_model_para$target_outcome_idx for (i in pos.update){ @@ -47,14 +47,14 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data, ld_feature <- sqrt(abs(ld_jk)) # - calculate delta - if (is.null(target_idx)){ + if (is.null(target_outcome_idx)){ lambda_outcome <- lambda } else { - lambda_outcome <- ifelse(i==target_idx, lambda_target, lambda) + lambda_outcome <- ifelse(i==target_outcome_idx, lambda_target_outcome, lambda) } delta <- boost_KL_delta(z = cb_model[[i]]$z, 
ld_feature = ld_feature, adj_dep = adj_dep, - func_prior = func_prior, lambda = lambda_outcome) + func_simplex = func_simplex, lambda = lambda_outcome) x_tmp <- cb_data$data[[X_dict]]$X scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1 @@ -63,7 +63,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data, } else { cb_model[[i]]$res / scaling_factor } - obj_ld <- if (LD_obj) ld_feature else rep(1, length(ld_feature)) + obj_ld <- if (LD_free) ld_feature else rep(1, length(ld_feature)) if (length(cb_data$data[[i]]$variable_miss)!=0){ obj_ld[cb_data$data[[i]]$variable_miss] <- 0 } @@ -84,14 +84,14 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data, ########## BEGIN: MAIN UPDATE ###################### # - Gradient ascent on beta beta_grad <- weights * sign(cb_model[[i]]$correlation) - if (dynamic_step){ + if (dynamic_learning_rate){ if (tail(cb_model[[i]]$obj_path, n=1) > 0.5){ - step1 = max(0.5 * (1 / ( 1 + decayrate*(length(cb_model[[i]]$obj_path)-1) )), cb_model[[i]]$step) + step1 = max(0.5 * (1 / ( 1 + learning_rate_decay*(length(cb_model[[i]]$obj_path)-1) )), cb_model[[i]]$learning_rate_init) } else { - step1 = cb_model[[i]]$step + step1 = cb_model[[i]]$learning_rate_init } } else { - step1 = cb_model[[i]]$step + step1 = cb_model[[i]]$learning_rate_init } cb_model[[i]]$beta <- cb_model[[i]]$beta + step1 * beta_grad @@ -149,32 +149,32 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P boost_KL_delta <- function(z, ld_feature, adj_dep, - func_prior = "LD_z2z", + func_simplex = "LD_z2z", lambda = 0.5){ # if (!is.null(n)){ z <- z * sqrt( (n-1)/(z^2+n-2) ) } - if (func_prior == "Range_Z"){ + if (func_simplex == "Range_Z"){ z2z <- lambda*0.5*z^2+(1-lambda)*abs(z) z2z <- (z2z - min(z2z)) / (max(z2z) - min(z2z)) * 5 z2z <- adj_dep*ld_feature*z2z delta <- exp(z2z - max(z2z)) - } else if (func_prior == "LD_z2z"){ + } else if (func_simplex == "LD_z2z"){ z2z <- 
lambda*0.5*z^2+(1-lambda)*abs(z) z2z <- adj_dep*ld_feature*z2z delta <- exp(z2z - max(z2z)) - } else if (func_prior == "only_z2z"){ + } else if (func_simplex == "only_z2z"){ z2z <- lambda*0.5*z^2+(1-lambda)*abs(z) z2z <- adj_dep*z2z delta <- exp(z2z - max(z2z)) - } else if (func_prior == "entropy"){ + } else if (func_simplex == "entropy"){ delta <- rep(1, length(z)) @@ -187,11 +187,11 @@ boost_KL_delta <- function(z, ld_feature, adj_dep, boost_check_stop <- function(cb_model, cb_model_para, pos_stop, - multicorrection_max = 1){ + multi_test_max = 1){ # - check the iteration for the stop outcome (pos_stop has the same jk with original data) iter_each <- sapply(pos_stop, function(i){ length(cb_model[[i]]$obj_path)-1 }) - lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < multicorrection_max) + lfsr_each <- sapply(pos_stop, function(i) cb_model[[i]]$stop_null < multi_test_max) pos_need_more <- which(iter_each <= 10 & lfsr_each) # pos_need_more <- which(iter_each <= 10) @@ -211,7 +211,7 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop, } else if (stop_method == "lfsr" | stop_method == "lfdr"){ cb_model[[i]]$stop_null <- cb_model[[i]]$stop_null + 0.05 } - cb_model[[i]]$step <- cb_model[[i]]$step*0.5 + cb_model[[i]]$learning_rate_init <- cb_model[[i]]$learning_rate_init*0.5 } # stop update for outcome with > 10 iterations @@ -229,13 +229,13 @@ boost_check_stop <- function(cb_model, cb_model_para, pos_stop, boost_obj_last <- function(cb_data, cb_model, cb_model_para, tau = 0.01, - func_prior = "z2z", + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = TRUE){ + lambda_target_outcome = 1, + LD_free = TRUE){ pos.stop <- cb_model_para$true_stop - target_idx <- cb_model_para$target_idx + target_outcome_idx <- cb_model_para$target_outcome_idx for (i in pos.stop){ # - check which jk update @@ -258,14 +258,14 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para, ld_feature <- sqrt(abs(ld_jk)) # - calculate delta - if 
(is.null(target_idx)){ + if (is.null(target_outcome_idx)){ lambda_outcome <- lambda } else { - lambda_outcome <- ifelse(i==target_idx, lambda_target, lambda) + lambda_outcome <- ifelse(i==target_outcome_idx, lambda_target_outcome, lambda) } delta <- boost_KL_delta(z = cb_model[[i]]$z, ld_feature = ld_feature, adj_dep = adj_dep, - func_prior = func_prior, lambda = lambda_outcome) + func_simplex = func_simplex, lambda = lambda_outcome) x_tmp <- cb_data$data[[X_dict]]$X scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1 @@ -275,7 +275,7 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para, cb_model[[i]]$res / scaling_factor } - obj_ld <- if (LD_obj) ld_feature else rep(1, length(ld_feature)) + obj_ld <- if (LD_free) ld_feature else rep(1, length(ld_feature)) if (length(cb_data$data[[i]]$variable_miss)!=0){ obj_ld[cb_data$data[[i]]$variable_miss] <- 0 } diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index 736fdd3..9d328b8 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -1,450 +1,445 @@ -get_integrated_weight <- function(avWeight, func_intw = "fun_R", alpha = 1.5){ - - if (func_intw == "prod"){ - av <- apply(avWeight, 1, function(w) prod(w)) - } else if (func_intw == "sqrt_prod"){ - av <- apply(avWeight, 1, function(w) prod(sqrt(w))) - } else if (func_intw == "fun_R"){ - av <- apply(avWeight, 1, function(w) prod(w^(alpha/ncol(avWeight)))) - } else { - stop("Please check func_intw! 
") - } - return(av / sum(av)) -} +merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, + min_abs_corr = 0.5, tol = 1e-9, + median_cos_abs_corr = 0.8){ -get_in_csets <- function(weights, coverage = 0.95){ - - temp <- order(weights, decreasing=T) - csets <- temp[1:min(which(cumsum(weights[temp]) > coverage))] # 95% - return(list(csets)) - -} + change_obj_each <- out_ucos$change_obj_each + coloc_sets <- out_cos$cos$cos + ucos_each <- out_ucos$ucos_each -# - Fast calculate correlation matrix -get_cormat <- function(X, intercepte = FALSE){ - X = t(X) - # Center each variable - if (!intercepte){ - X = X - rowMeans(X) - } - # Standardize each variable - X = X / sqrt(rowSums(X^2)) - # Calculate correlations - cr = tcrossprod(X) - return(cr) -} + # - remove overlap between coloc_sets and single_sets + is_overlap <- is_highLD <- c() + for (i in 1:length(coloc_sets)){ + # cset1 <- coloc_sets[[i]] + coloc_outcome <- out_cos$cos$coloc_outcomes[[i]] + ttmp <- as.numeric(gsub("[^0-9.]+", "", colnames( out_cos$cos$avWeight[[i]]))) + pos_name <- order(ttmp) + out_cos$cos$avWeight[[i]] <- out_cos$cos$avWeight[[i]][,pos_name] + for (j in 1:length(ucos_each)){ + cset2 <- ucos_each[[j]] + fine_outcome <- out_ucos$ucos_outcome[j] + if (fine_outcome %in% coloc_outcome){ -#' @importFrom utils head tail -check_null_post <- function(cb_obj, - coloc_sets_temp, - coloc_outcomes, - check_null = 0.1, - check_null_method = "profile", - weaker_ucos = TRUE){ - - extract_last <- function(lst) { tail(lst, n = 1)} - extract_first <- function(lst) { head(lst, n = 1)} - - get_profile <- function(cs_beta, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1){ - - if (!is.null(X)){ - mean((Y-X%*%as.matrix(cs_beta))^2) *N/(N-1) - } else if (!is.null(XtY)){ - scaling_factor <- if (!is.null(N)) (N - 1) else 1 - yty <- YtY / scaling_factor - xtx <- XtX - if (length(miss_idx)!=0){ - xty <- XtY[-miss_idx] / scaling_factor - cs_beta <- cs_beta[-miss_idx] - 
} else { xty <- XtY / scaling_factor } - - ( yty - 2*sum(cs_beta*xty) + sum( (xtx %*% as.matrix(cs_beta)) * cs_beta ) ) * adj_dep - } - - } - - get_cs_obj <- function(cs_beta, res, tau, func_prior, lambda, adj_dep, LD_obj, - X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL){ - - correlation <- get_correlation(X = X, res = res, XtY = XtY, N = N, YtY = YtY, - XtX = XtX, beta_k = cs_beta, miss_idx = miss_idx) - z <- get_z(correlation, n=N, res) - abs_cor <- abs(correlation) - jk <- which(abs_cor == max(abs_cor)) - jk <- ifelse(length(jk) == 1, jk, sample(jk,1)) - P <- length(z) - ld_jk <- get_LD_jk(jk, X = X, XtX = XtX, N = N, - remain_idx = setdiff(1:P, miss_idx), P = P) - ld_feature <- sqrt(abs(ld_jk)) - # - calculate delta - delta <- boost_KL_delta(z = z, ld_feature = ld_feature, adj_dep=adj_dep, - func_prior = func_prior, lambda = lambda) - scaling_factor <- if (!is.null(N)) (N - 1) else 1 - cov_Xtr <- if(!is.null(X)){ - t(X)%*%res / scaling_factor - } else { - res / scaling_factor - } - obj_ld <- if (LD_obj) ld_feature else rep(1, length(ld_feature)) - obj_ld[miss_idx] <- 0 - exp_term <- adj_dep * obj_ld * (abs(cov_Xtr)) - return(tau * matrixStats::logSumExp(exp_term / tau + log(delta))) - - } - - update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx){ - if (!is.null(X)){ - return(Y - X%*%cs_beta) - } else if (!is.null(XtX)){ - scaling.factor <- if (!is.null(N)) N-1 else 1 - xtx <- XtX / scaling.factor - if (length(miss_idx)!=0){ - xty <- XtY[-miss_idx] / scaling.factor - res.tmp <- rep(0, length(XtY)) - res.tmp[-miss_idx] <- xty - xtx%*%cs_beta[-miss_idx] - } else { - xty <- XtY / scaling.factor - res.tmp <- xty - xtx%*%cs_beta - } - return(res.tmp) + # - if fine_y in coloc_y, we check only the overlap + pp <- which(coloc_outcome == fine_outcome) + w_tmp <- out_cos$cos$avWeight[[i]][,pp] + cset1 <- get_in_cos(w_tmp, coverage = coverage)[[1]] + # - addhoc: if median_cos_abs_corr > 0.8, 
remove single sets + X_dict <- cb_obj$cb_data$dict[fine_outcome] + res <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[fine_outcome]]$N, + miss_idx = cb_obj$cb_data$data[[fine_outcome]]$variable_miss, + P = cb_obj$cb_model_para$P) + # is.between <- length(intersect(cset1, cset2)) != 0 + is.between <- (abs(res[2]-1) < tol) + if (is.between){ + is_overlap = c(is_overlap, j) + } else { + is.higLD <- res[2] > median_cos_abs_corr + if (is.higLD){ is_highLD = c(is_highLD, j)} + } + + } else if (!(fine_outcome %in% coloc_outcome)){ + + # - if fine_y not in coloc_y, we check overlap and also min_purity + change_obj_coloc <- out_cos$cos$cs_change + cset1 <- coloc_sets[[i]] + res <- list() + for (ii in 1:cb_obj$cb_model_para$L){ + X_dict <- cb_obj$cb_data$dict[ii] + res[[ii]] <- get_between_purity(cset1, cset2, X = cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[ii]]$N, + miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, + P = cb_obj$cb_model_para$P) + } + res <- Reduce(pmax, res) + min_between <- res[1] + max_between <- res[2] + ave_between <- res[3] + is.between <- ((min_between>median_cos_abs_corr) & (abs(max_between-1) < tol)) + # is.between <- (min_between>min_abs_corr) * (abs(max_between-1)median_cos_abs_corr) + if (is.between){ + is_overlap = c(is_overlap, j) + # -- add weight + add_avW <- out_ucos$avW_ucos_each[,j] + out_cos$cos$avWeight[[i]] <- cbind(add_avW, out_cos$cos$avWeight[[i]]) + colnames( out_cos$cos$avWeight[[i]])[1] <- paste0("outcome", fine_outcome) + ttmp <- as.numeric(gsub("[^0-9.]+", "", colnames( out_cos$cos$avWeight[[i]]))) + pos_name <- order(ttmp) + out_cos$cos$avWeight[[i]] <- out_cos$cos$avWeight[[i]][,pos_name] + change_obj_coloc_i <- change_obj_coloc[i,] + change_obj_each_j <- change_obj_each[j,] + out_cos$cos$cs_change[i,] <- pmax(change_obj_coloc_i, change_obj_each_j) + coloc_outcome <- 
sort(c(coloc_outcome, fine_outcome)) + out_cos$cos$coloc_outcomes[[i]] <- coloc_outcome + } + + } + } } - } - # - add hoc - cut <- if(length(cb_obj$cb_data)==1) 0.2 else 1 - - # ----- null filtering - cb_data <- cb_obj$cb_data - cs_change <- check_cs_change <- matrix(0, nrow = length(coloc_sets_temp), ncol = cb_obj$cb_model_para$L) - colnames(cs_change) <- colnames(check_cs_change) <- paste0("change_obj_", 1:cb_obj$cb_model_para$L) - max_change <- matrix(0, nrow = length(coloc_sets_temp), ncol = cb_obj$cb_model_para$L) - for (i in 1:length(coloc_sets_temp)){ - cs_variants <- as.numeric(unlist(coloc_sets_temp[[i]])) - for (j in coloc_outcomes){ - cs_beta <- cb_obj$cb_model[[j]]$beta - cs_beta[cs_variants] <- 0 - X_dict <- cb_data$dict[j] - adj_dep <- cb_data$data[[j]]$dependency - if (check_null_method == "profile"){ - cs_profile <- get_profile(cs_beta, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, - XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, - YtY = cb_data$data[[j]]$YtY, N = cb_data$data[[j]]$N, - miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep) - last_profile <- extract_last(cb_obj$cb_model[[j]]$profile_loglike_each) - change <- abs(cs_profile - last_profile) - # - add hoc - if (min(cb_obj$cb_model[[j]]$multi_correction_univariate[cs_variants]) >= cut){ change <- 0 } - # ------ - # - check_null - check_cs_change[i,j] <- change / diff(range(cb_obj$cb_model[[j]]$profile_loglike_each)) - cs_change[i,j] <- change # / diff(range(cb_obj$cb_model[[j]]$profile_loglike_each)) - first_profile <- extract_first(cb_obj$cb_model[[j]]$profile_loglike_each) - max_change[i,j] <- (first_profile - last_profile) >= cb_obj$cb_model[[j]]$check_null_max - } else if (check_null_method == "obj"){ - res <- update_res(X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, - XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, - N = cb_data$data[[j]]$N, cs_beta, - miss_idx = cb_data$data[[j]]$variable_miss) - cs_obj <- get_cs_obj(cs_beta, 
res, cb_obj$cb_model_para$tau, cb_obj$cb_model_para$func_prior, - cb_obj$cb_model_para$lambda, adj_dep = adj_dep, - LD_obj = cb_obj$cb_model_para$LD_obj, - X = cb_data$data[[X_dict]]$X, N = cb_data$data[[j]]$N, - XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[X_dict]]$XtY, - YtY = cb_data$data[[X_dict]]$YtY, - miss_idx = cb_data$data[[j]]$variable_miss) - last_obj <- min(cb_obj$cb_model[[j]]$obj_path) - change <- abs(cs_obj - last_obj) - if (length(cb_obj$cb_model[[j]]$obj_path)==1){ - total_obj <- 1 + + is_overlap <- unique(c(is_overlap,is_highLD)) + if (length(is_overlap) != 0){ + if (length(is_overlap) == length(ucos_each)){ + out_ucos$ucos_each = NULL + out_ucos$avW_ucos_each <- NULL + out_ucos$change_obj_each <- NULL + out_ucos$purity_each <- NULL } else { - total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) + out_ucos$ucos_each = ucos_each[-is_overlap] + out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[,-is_overlap,drop = FALSE] + out_ucos$change_obj_each <- change_obj_each[-is_overlap,,drop = FALSE] + out_ucos$purity_each <- out_ucos$purity_each[-is_overlap,,drop = FALSE] } - check_cs_change[i,j] <- change / total_obj - cs_change[i,j] <- change - max_change[i,j] <- total_obj >= cb_obj$cb_model[[j]]$check_null_max - } } - } - if (!weaker_ucos){ - check_cs_change <- cs_change - check_null_tmp <- sapply(1:cb_obj$cb_model_para$L, function(j) cb_obj$cb_model[[j]]$check_null_max) - } else { check_null_tmp <- rep(check_null, cb_obj$cb_model_para$L) } - # check_tmp <- sapply(1:nrow(check_cs_change), function(ii) { - # check_cs_tmp <- check_cs_change[ii, ] > check_null - # check_max_change <- max_change[ii, ] - # pos <- which(check_max_change!=0) - # min(check_cs_tmp[pos]) - # }) - cs_change <- as.data.frame(cs_change * max_change) # changed - for (ii in 1:nrow(check_cs_change)){ - cs_tmp <- check_cs_change[ii,] - check_cs_change[ii,] <- (cs_tmp>check_null_tmp) - } - is_non_null <- which(rowSums( check_cs_change * max_change ) != 0) - # is_non_null <- 
which(rowSums( (cs_change >= check_null) * (max_change >= check_null_max) ) != 0) - # is_non_null <- which(rowSums(cs_change >= check_null) != 0) - - ll = list("cs_change" = cs_change, - "is_non_null" = is_non_null) - return(ll) + if (length( out_ucos$ucos_each) == 0){ + out_ucos$ucos_each = NULL + out_ucos$avW_ucos_each <- NULL + out_ucos$change_obj_each <- NULL + out_ucos$purity_each <- NULL + } + ll <- list("ucos" = out_ucos, "cos" = out_cos) + return(ll) + } #' @importFrom stats na.omit -get_purity <- function(pos, X=NULL, Xcorr=NULL, N = NULL, n = 100) { - get_upper_tri = Rfast::upper_tri - get_median = Rfast::med - - if (sum(is.na(pos))!=0){ - pos <- as.numeric(na.omit(pos)) - } - - if (length(pos) == 1) - return(c(1,1,1)) - else { - - # Subsample the columns if necessary. - if (length(pos) > n) - pos = sample(pos,n) - - if (is.null(Xcorr)) { - X_sub = X[,pos] - X_sub = as.matrix(X_sub) - corr <- suppressWarnings({get_cormat(X_sub)}) - corr[which(is.na(corr))] = 0 - value = abs(get_upper_tri(corr)) - } else { - Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr - value = abs(get_upper_tri(Xcorr[pos,pos])) +merge_ucos <- function(cb_obj, past_out, + min_abs_corr = 0.5, + median_abs_corr = NULL, + n_purity = 100, + median_cos_abs_corr = 0.8, + tol = 1e-9){ + + + out_ucos <- past_out$ucos + out_cos <- past_out$cos + ucos_each <- out_ucos$ucos_each + change_obj_each <- out_ucos$change_obj_each + + # calculate between purity + ncsets <- length(ucos_each) + min_between <- max_between <- ave_between <- matrix(0, nrow = ncsets, ncol = ncsets) + for (i.between in 1:(ncsets-1)){ + for (j.between in (i.between+1):ncsets){ + cset1 <- ucos_each[[i.between]] + cset2 <- ucos_each[[j.between]] + y.i <- out_ucos$ucos_outcome[i.between] + y.j <- out_ucos$ucos_outcome[j.between] + if (y.i == y.j){ next } + yy <- c(y.i, y.j) + res <- list() + flag <- 1 + for (ii in yy){ + X_dict <- cb_obj$cb_data$dict[ii] + res[[flag]] <- get_between_purity(cset1, cset2, X = 
cb_obj$cb_data$data[[X_dict]]$X, + Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, + N = cb_obj$cb_data$data[[ii]]$N, + miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, + P = cb_obj$cb_model_para$P) + flag <- flag + 1 + } + if (min_abs_corr == 0){ + res <- Reduce(pmin, res) + } else { + res <- Reduce(pmax, res) + } + min_between[i.between, j.between] <- min_between[j.between, i.between] <- res[1] + max_between[i.between, j.between] <- max_between[j.between, i.between] <- res[2] + ave_between[i.between, j.between] <- ave_between[j.between, i.between] <- res[3] } - return(c(min(value), - sum(value)/length(value), - get_median(value))) } -} + # is.between <- (min_between>min_abs_corr) * (abs(max_between-1)median_cos_abs_corr) + is.between <- ((min_between>median_cos_abs_corr) & (abs(max_between-1) < tol)) + if (sum(is.between) != 0){ + temp <- sapply(1:nrow(is.between), function(x){ + tt <- c(x, which(is.between[x,] != 0)) + return(paste0(sort(tt), collapse = ";")) + }) + temp <- merge_sets(temp) + potential_merged <- lapply(temp, function(x) as.numeric(unlist(strsplit(x, ";")))) + potential_merged <- potential_merged[which(sapply(potential_merged, length) >= 2)] + coloc_sets_merged <- avWeight_merged <- + cs_change_merged <- coloc_outcomes_merged <- list() + is_merged <- c() + for (i.m in 1:length(potential_merged)){ + temp_set <- as.numeric(potential_merged[[i.m]]) + # refine avWeight + merged <- out_ucos$avW_ucos_each[, temp_set] + unique_coloc_outcomes <- as.numeric(gsub(".*Y(\\d+).*", "\\1", colnames(merged))) + if (length(unique(unique_coloc_outcomes))==1) next + # define merged set + coloc_sets_merged <- c(coloc_sets_merged, list( unique(unlist(ucos_each[temp_set])) )) + colnames(merged) <- paste0("outcome", unique_coloc_outcomes) + coloc_outcomes_merged <- c(coloc_outcomes_merged, + list(unique(sort(unique_coloc_outcomes)))) + # colnames(temp) <- unique_coloc_outcomes + avWeight_merged <- c(avWeight_merged, list(merged)) + # refine cs_change + change_cs_tmp <- 
change_obj_each[temp_set, , drop = FALSE] + cs_change_merged <- c(cs_change_merged, + list(apply(change_cs_tmp, 2, max))) + is_merged <- c(is_merged, temp_set) + } -# ------ Calculate modularity ---------- -get_modularity <- function(Weight, B){ - if (dim(Weight)[1] == 1){ - Q <- 0 - } else { - W_pos <- Weight * (Weight > 0) - W_neg <- Weight * (Weight < 0) - N <- dim(Weight)[1] - K_pos <- colSums(W_pos) - K_neg <- colSums(W_neg) - m_pos <- sum(K_pos) - m_neg <- sum(K_neg) - m <- m_pos + m_neg - cate <- B %*% t(B) - if (m_pos == 0 & m_neg == 0){ - Q <- 0 - } else { - if (m_pos == 0){ - Q_positive <- 0 - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / m_neg) * cate) / m_neg - } else if (m_neg == 0){ - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos - Q_negative <- 0 + if (length(is_merged) != 0){ + + # --- check merged coloc set purity + purity = vector(mode='list', length=length(coloc_sets_merged)) + for(ee in 1:length(coloc_sets_merged)){ + coloc_t <- coloc_outcomes_merged[[ee]] + p_tmp <- c() + for (i3 in coloc_t){ + pos <- coloc_sets_merged[[ee]] + X_dict <- cb_obj$cb_data$dict[i3] + if (!is.null(cb_obj$cb_data$data[[X_dict]]$XtX)){ + pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss)) + # - it could happen since merge_cos + if(sum(is.na(pos)) != 0){pos <- as.numeric(na.omit(pos))} + } + tmp <- matrix(get_purity(pos,X=cb_obj$cb_data$data[[X_dict]]$X, + Xcorr=cb_obj$cb_data$data[[X_dict]]$XtX, + N=cb_obj$cb_data$data[[i3]]$N,n=n_purity),1,3) + p_tmp <- rbind(p_tmp, tmp) + } + purity[[ee]] <- matrix(apply(p_tmp, 2, max),1,3) + + } + purity_all <- do.call(rbind, purity) + purity_all = as.data.frame(purity_all) + colnames(purity_all) = c("min_abs_corr","mean_abs_corr","median_abs_corr") + if (is.null(median_abs_corr)) { + is_pure = which(purity_all[,1] >= min_abs_corr) } else { - Q_positive <- sum((W_pos - K_pos %*% t(K_pos) / m_pos) * cate) / m_pos - Q_negative <- sum((W_neg - K_neg %*% t(K_neg) / 
m_neg) * cate) / m_neg + is_pure = which(purity_all[,1] >= min_abs_corr | purity_all[,3] >= median_abs_corr) + } + if (length(is_pure) > 0){ + # add coloc + merged_colocsets <- coloc_sets_merged[is_pure] + names(merged_colocsets) <- paste0("merged_cos", 1:length(is_pure)) + out_cos$cos$cos <- c( out_cos$cos$cos, + merged_colocsets) + out_cos$cos$purity <- rbind( out_cos$cos$purity, + purity_all[is_pure, ]) + out_cos$cos$evidence_strength <- c( out_cos$cos$evidence_strength, + rep(0.95,length(is_pure))) + cs_change_tmp <- do.call(rbind, cs_change_merged)[is_pure, , drop=FALSE] + colnames(cs_change_tmp) <- paste0("change_obj_", 1:cb_obj$cb_model_para$L) + rownames(cs_change_tmp) <- names(merged_colocsets) + out_cos$cos$cs_change <- rbind( out_cos$cos$cs_change, + cs_change_tmp) + out_cos$cos$avWeight <- c( out_cos$cos$avWeight, + avWeight_merged[is_pure]) + out_cos$cos$coloc_outcomes <- c( out_cos$cos$coloc_outcomes, + coloc_outcomes_merged[is_pure]) + } + + # - remove single + if (length(is_merged) == length(ucos_each)){ + out_ucos$ucos_each <- NULL + out_ucos$avW_ucos_each <- NULL + out_ucos$change_obj_each <- NULL + out_ucos$purity_each <- NULL + } else { + out_ucos$ucos_each <- ucos_each[-is_merged] + out_ucos$avW_ucos_each <- out_ucos$avW_ucos_each[, -is_merged, drop = FALSE] + out_ucos$change_obj_each <- change_obj_each[-is_merged, , drop = FALSE] + out_ucos$purity_each <- out_ucos$purity_each[-is_merged, , drop = FALSE] } } - Q <- m_pos / m * Q_positive - m_neg / m * Q_negative } + ll <- list("ucos" = out_ucos, "cos" = out_cos) + return(ll) } -#' @importFrom stats cutree -get_n_cluster <- function(hc, Sigma, m=ncol(Sigma), between_cluster = 0.8){ - if (min(Sigma) > between_cluster){ - IND = 1 - Q = 1 + +get_vcp <- function(past_out, P){ + if (!is.null(past_out$cos$cos$cos)){ + avW_coloc_vcp <- sapply(past_out$cos$cos$avWeight, get_integrated_weight) + } else {avW_coloc_vcp <- NULL} + all_weight <- avW_coloc_vcp + if (length(all_weight) == P){ + all_weight <- 
as.matrix(unlist(all_weight)) + } + if (!is.null(all_weight)){ + all_weight <- apply(all_weight, 2, as.numeric) + all_weight <- as.matrix(all_weight) + vcp <- as.vector(1 - apply(1 - all_weight,1,prod)) } else { - Q <- c() - if (ncol(Sigma) < 10){m = ncol(Sigma)} - for(i in 1:m) - { - index=cutree(hc,i) - B=sapply(1:i, function(t) as.numeric(index==t)) - Q[i] <- get_modularity(Sigma, B) - } - - IND=which(Q==max(Q)) - L=length(IND) - if (L>1) IND=IND[1] + vcp <- rep(0, P) } - return(list("n_cluster" = IND, - "Qmodularity" = Q)) + return(vcp) } -w_purity <- function(weights, X=NULL, Xcorr=NULL, N = NULL, n = 100, coverage = 0.95, - min_abs_corr = 0.5, median_abs_corr = NULL, miss_idx = NULL){ - - tmp <- apply(weights, 2, w_cs, coverage = coverage) - tmp_purity <- apply(tmp, 2, function(tt){ - pos <- which(tt == 1) - # deal with missing snp here - if (!is.null(Xcorr)){ - pos <- match(pos, setdiff(1:length(tmp), miss_idx)) + +get_pip <- function(past_out, R, P){ + if (length(past_out$cos$cos$cos)!=0){ + av_coloc <- do.call(cbind, past_out$cos$cos$avWeight) + } else {av_coloc = NULL} + if (length(past_out$ucos$ucos_each)!=0){ + av_noncoloc <- past_out$ucos$avW_ucos_each + tmp <- do.call(rbind, strsplit(colnames(av_noncoloc), ":")) + colnames(av_noncoloc) <- paste0("outcome", gsub("[^0-9.]+", "", tmp[,2])) + } else {av_noncoloc = NULL} + av_all <- cbind(av_coloc, av_noncoloc) + pip <- vector(mode='list', length=R) + if (!is.null(av_all)){ + av_name <- colnames(av_all) + for (i in 1:R){ + pos <- grep(i, av_name) + if (length(pos) != 0){ + av_i <- as.matrix(av_all[, pos]) + pip[[i]] <- as.vector(1 - apply(1-av_i,1,prod)) + } else { + pip[[i]] <- rep(0,P) + } } - get_purity(pos, X=X, Xcorr=Xcorr, N=N, n = n) - }) - if (is.null(median_abs_corr)) { - is_pure = which(tmp_purity[1,] >= min_abs_corr) - } else { - is_pure = which(tmp_purity[1,] >= min_abs_corr | tmp_purity[3,] >= median_abs_corr) } - return(is_pure) + return(pip) } +check_two_overlap_sets <- function(total, i, 
j){ + t1 <- total[[i]] + t2 <- total[[j]] + if (t1 != "ONE" & t2 != "ONE"){ + return(ifelse(t1>t2, i, j)) + } else if (t1 == "ONE" & t2 != "ONE"){ + return(i) + } else if (t1 != "ONE" & t2 == "ONE"){ + return(j) + } else if (t1 == "ONE" & t2 == "ONE"){ + return(sample(c(i,j), 1)) + } +} -#' @importFrom stats na.omit -# - Calculate purity between two confidence sets -get_between_purity <- function(pos1, pos2, X=NULL, Xcorr=NULL, N = NULL, miss_idx = NULL, P = NULL){ - - get_matrix_mult <- function(X_sub1, X_sub2){ - - X_sub1 = t(X_sub1) - X_sub2 = t(X_sub2) - # Standardize each variable - X_sub1 = X_sub1 / sqrt(rowSums(X_sub1^2)) - X_sub1[is.nan(X_sub1)] = 0 - X_sub2 = X_sub2 / sqrt(rowSums(X_sub2^2)) - X_sub2[is.nan(X_sub2)] = 0 - # Calculate correlations - cr = tcrossprod(X_sub1, X_sub2) - return(cr) - - } - get_median = Rfast::med - - if (is.null(Xcorr)){ - X_sub1 = X[,pos1,drop=FALSE] - X_sub2 = X[,pos2,drop=FALSE] - value <- abs(get_matrix_mult(X_sub1, X_sub2)) - - } else { - pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) - pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) - # scaling_factor <- if (!is.null(N)) N-1 else 1 - if (length(pos1)!=0 & length(pos2)!=0){ - value = abs(Xcorr[pos1,pos2]) # / scaling_factor - } else { - value = 0 + +# Function to merge overlapping sets +merge_sets <- function(vec) { + split_lists <- lapply(vec, function(x) as.numeric(unlist(strsplit(x, ";")))) + result <- list() + while (length(split_lists) > 0) { + current <- split_lists[[1]] + split_lists <- split_lists[-1] + repeat { + overlap_index <- NULL + for (i in seq_along(split_lists)) { + if (length(intersect(current, split_lists[[i]])) > 0) { + overlap_index <- i + break } - + } + if (!is.null(overlap_index)) { + current <- union(current, split_lists[[overlap_index]]) + split_lists <- split_lists[-overlap_index] + } else { + break + } } - return(c(min(value), max(value), get_median(value))) + result <- c(result, list(paste(sort(current), collapse = ";"))) + } + 
return(result) } -#' @importFrom stats var -#' @importFrom utils tail -get_cos_evidence <- function(cb_obj, coloc_out, data_info){ - - get_cos_config <- function(w, config_idx, alpha = 1.5, coverage = 0.95){ - - w_outcome <- colnames(w) - config_outcome <- paste0("outcome", config_idx) - pos <- which(w_outcome %in% config_outcome) - w_config <- w[, pos, drop=FALSE] - int_w <- get_integrated_weight(w_config, alpha = alpha) - unlist(get_in_csets(int_w, coverage = coverage)) - + + +get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weight=FALSE){ + + avWeight <- lapply(coloc_outcomes, function(i){ + pos <- which(update[i,] != 0) + weight <- cb_model[[i]]$weights_path[match(pos.coloc, pos), ] + }) + avWeight <- do.call(cbind, avWeight) + if (name_weight) { + colnames(avWeight) <- paste0("outcome", coloc_outcomes) } + return(avWeight) - get_cos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1){ - - if (!is.null(X)){ - cos_profile <- mean((Y-X%*%as.matrix(cs_beta))^2) *N/(N-1) - yty <- var(Y) - } else if (!is.null(XtY)){ - scaling_factor <- if (!is.null(N)) (N - 1) else 1 - yty <- YtY / scaling_factor - xtx <- XtX - if (length(miss_idx)!=0){ - xty <- XtY[-miss_idx] / scaling_factor - cs_beta <- cs_beta[-miss_idx] - } else { xty <- XtY / scaling_factor } - - cos_profile <- ( yty - 2*sum(cs_beta*xty) + sum( (xtx %*% as.matrix(cs_beta)) * cs_beta ) ) * adj_dep - } - delta <- yty - cos_profile - if (delta <= 0){ - warning(paste("Warning message: potential sumstat & LD mismatch may happens for outcome", outcome_idx, - ". Using logLR = CoS(profile) - max(profile). 
Please check our website.")) +} + + + +get_max_profile <- function(cb_obj, check_null_max=0.02, check_null_method = "profile"){ + for (i in 1:cb_obj$cb_model_para$L){ + cb <- cb_obj$cb_model[[i]] + scaling_factor <- if(!is.null(cb_obj$cb_data$data[[i]]$N)) cb_obj$cb_data$data[[i]]$N-1 else 1 + if (check_null_method == "profile"){ + cb$check_null_max <- 1000*check_null_max / scaling_factor + } else { + cb$check_null_max <- check_null_max } - cos_profile + cb_obj$cb_model[[i]] <- cb } + return(cb_obj) +} + + +### Function for check cs for each weight +w_cs <- function(w, coverage = 0.95){ + n <- sum(cumsum(sort(w,decreasing = TRUE)) < coverage) + 1 + o <- order(w,decreasing = TRUE) + result = rep(0,length(w)) + result[o[1:n]] = 1 + return(result) +} + +get_integrated_weight <- function(avWeight, weight_fudge_factor = 1.5){ + av <- apply(avWeight, 1, function(w) prod(w^(weight_fudge_factor/ncol(avWeight)))) + return(av / sum(av)) +} + +get_in_cos <- function(weights, coverage = 0.95){ - # - - get_outcome_profile_change <- function(outcome_idx, cos, cb_obj, check_null_max){ - extract_last <- function(lst) { tail(lst, n = 1)} - cb_data <- cb_obj$cb_data - cs_beta <- rep(0, cb_obj$cb_model_para$P) - cs_beta[cos] <- cb_obj$cb_model[[outcome_idx]]$beta[cos] - X_dict <- cb_data$dict[outcome_idx] - cos_profile <- get_cos_profile(cs_beta, outcome_idx, - X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[outcome_idx]]$Y, - XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, - YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, - miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency) - max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) - ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) - } + temp <- order(weights, decreasing=T) + csets <- temp[1:min(which(cumsum(weights[temp]) > coverage))] # 95% + return(list(csets)) - # - calculate best configuration 
likelihood explained by minimal configuration - get_normalization_evidence <- function(profile_change, null_max, outcomes, outcome_names) { - # Define the exponential likelihood ratio normalization (ELRN) - logLR_normalization <- function(ratio) { 1 - exp( - 2*ratio ) } - - ratio <- profile_change / null_max - prob <- logLR_normalization(ratio) - df <- data.frame(outcomes_index = outcomes, relative_logLR = ratio, npc_outcome = prob) - rownames(df) <- outcome_names[outcomes] - sorted_df <- df[order(-df$relative_logLR), ] - sorted_df$strong_effect <- sorted_df$relative_logLR >= 1 - return(sorted_df) +} + + +#' Pure R implementation (fallback) +#' @noRd +get_merge_ordered_with_indices <- function(vector_list) { + # Quick validation + if (!is.list(vector_list) || length(vector_list) == 0) { + stop("Input must be a non-empty list of vectors") } - get_npuc <- function(npc_outcome){ - # npuc_outcome <- sapply(1:length(npc_outcome), function(i) npc_outcome[i]*prod(1-npc_outcome[-i]) ) - # 1 - prod(1-npuc_outcome) - max_idx <- which.max(npc_outcome) - npc_outcome[max_idx] * prod(1 - npc_outcome[-max_idx]) + # Convert all vectors to character + vector_list <- lapply(vector_list, as.character) + n_vectors <- length(vector_list) + + # Estimate total and unique elements + total_elements <- sum(sapply(vector_list, length)) + + # Phase 1: Build merged vector + seen <- new.env(hash = TRUE, parent = emptyenv(), size = total_elements) + merged <- character(total_elements) # Pre-allocate maximum size + merge_idx <- 1 + + # Process each vector to create the merged vector + for (i in seq_len(n_vectors)) { + vec <- vector_list[[i]] + for (j in seq_along(vec)) { + elem <- vec[j] + if (!exists(elem, envir = seen, inherits = FALSE)) { + seen[[elem]] <- merge_idx # Store position directly (optimization) + merged[merge_idx] <- elem + merge_idx <- merge_idx + 1 + } + } } - avWeight <- coloc_out$avWeight - cs_change <- coloc_out$cs_change - check_null_max <- sapply(cb_obj$cb_model, 
function(cb) cb$check_null_max) - outcome_names <- data_info$outcome_info$outcome_names - n_cos <- length(avWeight) - npc <- c() # normalized probability for / of colocalization - normalization_evidence <- list() - for (i in 1:n_cos){ - w <- avWeight[[i]] - outcomes <- coloc_out$coloc_outcomes[[i]] - # most likely cos - cos <- get_cos_config(w, outcomes,alpha = cb_obj$cb_model_para$alpha,coverage = cb_obj$cb_model_para$coverage) - profile_change_outcome <- sapply(outcomes, function(outcome_idx){ - get_outcome_profile_change(outcome_idx, cos, cb_obj, check_null_max) - }) - normalization_evidence[[i]] <- get_normalization_evidence(profile_change = profile_change_outcome, - null_max = check_null_max[outcomes], - outcomes, outcome_names) - - # - calcualte CoS colocalization probability (CCP) and CoS uncolocalization probability (UCCP) - npuc <- get_npuc(normalization_evidence[[i]]$npc_outcome) - npc[i] <- 1 - npuc + # Trim merged result to actual size + merged_length <- merge_idx - 1 + if (merged_length < length(merged)) { + merged <- merged[1:merged_length] } - names(normalization_evidence) <- names(npc) <- names(coloc_out$cos) - return(list(normalization_evidence=normalization_evidence, npc=npc)) + + merged } - diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index d65db13..f5fa8f2 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -13,27 +13,27 @@ #' @export colocboost_workhorse <- function(cb_data, M=NULL, - step = 0.01, tau = 0.01, prioritize_jkstar = TRUE, - decayrate = 1, - func_prior = "z2z", + learning_rate_init = 0.01, + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - jk_equiv_cor = 0.8, + lambda_target_outcome = 1, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, stop_thresh = 1e-06, - func_multicorrection = "lfdr", + func_multi_test = "lfdr", stop_null = 0.05, - multicorrection_max = 1, - multicorrection_cut = 1, + multi_test_max = 1, + multi_test_thresh = 1, ash_prior = "normal", 
p.adjust.methods = "fdr", func_compare = "min_max", - coloc_thres = 0.1, - LD_obj = FALSE, - dynamic_step = TRUE, - target_idx = NULL, + coloc_thresh = 0.1, + LD_free = FALSE, + dynamic_learning_rate = TRUE, + target_outcome_idx = NULL, outcome_names = NULL){ @@ -41,44 +41,44 @@ colocboost_workhorse <- function(cb_data, stop("Input must from colocboost function!")} cb_model <- colocboost_init_model(cb_data, - step = step, + learning_rate_init = learning_rate_init, stop_thresh = stop_thresh, - func_multicorrection = func_multicorrection, + func_multi_test = func_multi_test, stop_null = stop_null, - multicorrection_max = multicorrection_max, + multi_test_max = multi_test_max, ash_prior = ash_prior, p.adjust.methods = p.adjust.methods, - target_idx = target_idx) + target_outcome_idx = target_outcome_idx) cb_model_para <- colocboost_init_para(cb_data, cb_model, tau=tau, - func_prior = func_prior, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - multicorrection_cut = multicorrection_cut, - func_multicorrection = func_multicorrection, - LD_obj = LD_obj, + lambda_target_outcome = lambda_target_outcome, + multi_test_thresh = multi_test_thresh, + func_multi_test = func_multi_test, + LD_free = LD_free, outcome_names = outcome_names, - target_idx = target_idx) + target_outcome_idx = target_outcome_idx) if (is.null(M)){M <- cb_model_para$L*200} cb_model_para$M <- M - # - if multicorrection value > multicorrection cutoff for some outcomes, we will not update them. + # - if multi_test value > multi_test cutoff for some outcomes, we will not update them. if (!is.null(cb_model_para$true_stop)){ if (sum(cb_model_para$update_y == 1) == 0){ # - if all outcomes do not have signals, STOP - message(paste("Using multiple correction method :", func_multicorrection, - ". Stop ColocBoost since the no outcomes", target_idx, "have signals!")) + message(paste("Using multiple correction method :", func_multi_test, + ". 
Stop ColocBoost since the no outcomes", target_outcome_idx, "have signals!")) } else { - message(paste("Using multiple correction method :", func_multicorrection, + message(paste("Using multiple correction method :", func_multi_test, ". Outcome", paste(cb_model_para$true_stop, collapse = ", "), "for all variants are greater than", - multicorrection_cut, ". Will not update it!")) + multi_test_thresh, ". Will not update it!")) } - if (!is.null(target_idx) & (M!= 1)){ - if (sum(cb_model_para$true_stop==target_idx)!=0){ - message(paste("Stop ColocBoost since the target outcome", target_idx, "do not have signals!")) + if (!is.null(target_outcome_idx) & (M!= 1)){ + if (sum(cb_model_para$true_stop==target_outcome_idx)!=0){ + message(paste("Stop ColocBoost since the target outcome", target_outcome_idx, "do not have signals!")) cb_model_para$update_y <- 0 } } @@ -94,14 +94,14 @@ colocboost_workhorse <- function(cb_data, # single effect with or without LD matrix message("Running colocboost with assumption of one causal per outcome!") cb_obj <- colocboost_one_causal(cb_model, cb_model_para, cb_data, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - LD_obj = LD_obj) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free) cb_obj$cb_model_para$coveraged = "one_causal" } else { @@ -115,22 +115,20 @@ colocboost_workhorse <- function(cb_data, # step 1: check which outcomes need to be updated at which variant cb_model_para <- colocboost_check_update_jk(cb_model, cb_model_para, cb_data, prioritize_jkstar = prioritize_jkstar, - jk_equiv_cor = jk_equiv_cor, + jk_equiv_corr = jk_equiv_corr, jk_equiv_loglik = jk_equiv_loglik, func_compare = func_compare, - coloc_thres = coloc_thres) + coloc_thresh = coloc_thresh) # step 2: gradient 
boosting for the updated outcomes cb_model <- colocboost_update(cb_model, cb_model_para, cb_data, tau = tau, - decayrate = decayrate, - func_prior = func_prior, + learning_rate_decay = learning_rate_decay, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target, - LD_obj = LD_obj, - dynamic_step = dynamic_step) - - # print(paste("m: update", which(cb_model_para$update_temp$update_status != 0), "at", cb_model_para$update_temp$real_update_jk[which(cb_model_para$update_temp$update_status != 0)])) + lambda_target_outcome = lambda_target_outcome, + LD_free = LD_free, + dynamic_learning_rate = dynamic_learning_rate) # step 3: check stop for the updated ones # # - update cb_model and cb_model_parameter @@ -160,7 +158,7 @@ colocboost_workhorse <- function(cb_data, cb_model[[i]]$profile_loglike_each[M_i-1] < cb_model[[i]]$stop_thresh # stop2 = tail(abs(diff(cb_model[[i]]$obj_path)), n=1) < cb_model[[i]]$stop_thresh multiple_correction <- get_multiple_correction(z=cb_model[[i]]$z, miss_idx = cb_data$data[[i]]$variable_miss, - func_multicorrection = func_multicorrection, + func_multi_test = func_multi_test, ash_prior = ash_prior, p.adjust.methods = p.adjust.methods) cb_model[[i]]$multi_correction <- multiple_correction @@ -173,8 +171,8 @@ colocboost_workhorse <- function(cb_data, } } } - # if (!is.null(target_idx)){ - # if (!is.na(stop[target_idx]) & stop[target_idx]){ stop = TRUE } + # if (!is.null(target_outcome_idx)){ + # if (!is.na(stop[target_outcome_idx]) & stop[target_outcome_idx]){ stop = TRUE } # } if (all(length(stop)==1 & stop)){ @@ -191,7 +189,7 @@ colocboost_workhorse <- function(cb_data, pos_stop <- which(stop) # which outcome reach the stop criterion ttmp <- boost_check_stop(cb_model, cb_model_para, pos_stop, - multicorrection_max = multicorrection_max) + multi_test_max = multi_test_max) cb_model_para <- ttmp$cb_model_para cb_model <- ttmp$cb_model # - if there is some outcomes need stop @@ -200,15 +198,15 @@ colocboost_workhorse <- 
function(cb_data, # calculate objective function of Y for the last iteration. cb_model <- boost_obj_last(cb_data, cb_model, cb_model_para, tau = tau, - func_prior = func_prior, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target) - if ( !is.null(target_idx) ){ - if (target_idx %in% cb_model_para$true_stop){ - message(paste("Boosting iterations for target outcome", target_idx, + lambda_target_outcome = lambda_target_outcome) + if ( !is.null(target_outcome_idx) ){ + if (target_outcome_idx %in% cb_model_para$true_stop){ + message(paste("Boosting iterations for target outcome", target_outcome_idx, "converge after", m, "iterations!")) - if (length(setdiff(cb_model_para$true_stop, target_idx))!=0){ - message(paste("Boosting iterations for outcome", paste(setdiff(cb_model_para$true_stop, target_idx), collapse = ", "), + if (length(setdiff(cb_model_para$true_stop, target_outcome_idx))!=0){ + message(paste("Boosting iterations for outcome", paste(setdiff(cb_model_para$true_stop, target_outcome_idx), collapse = ", "), "converge after", m, "iterations!")) } } else { @@ -240,9 +238,9 @@ colocboost_workhorse <- function(cb_data, # calculate objective function of Y for the last iteration. 
cb_model <- boost_obj_last(cb_data, cb_model, cb_model_para, tau = tau, - func_prior = func_prior, + func_simplex = func_simplex, lambda = lambda, - lambda_target = lambda_target) + lambda_target_outcome = lambda_target_outcome) warning(paste("COLOC-BOOST updates did not converge in", M, "iterations; checkpoint at last iteration")) cb_model_para$coveraged = FALSE } diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 2729875..d10c19f 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -12,53 +12,52 @@ colocboost( dict_YX = NULL, dict_sumstatLD = NULL, outcome_names = NULL, + target_outcome_idx = NULL, + target_outcome_variables = TRUE, + overlap_variables = FALSE, + intercept = TRUE, + standardize = TRUE, effect_est = NULL, effect_se = NULL, effect_n = NULL, M = NULL, stop_thresh = 1e-06, - step = 0.01, - decayrate = 1, tau = 0.01, + learning_rate_init = 0.01, + learning_rate_decay = 1, + dynamic_learning_rate = TRUE, prioritize_jkstar = TRUE, func_compare = "min_max", - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, - coloc_thres = 0.1, - intercept = TRUE, - standardize = TRUE, + coloc_thresh = 0.1, + lambda = 0.5, + lambda_target_outcome = 1, + func_simplex = "LD_z2z", + func_multi_test = "lfdr", + stop_null = 1, + multi_test_max = 1, + multi_test_thresh = 1, + ash_prior = "normal", + p.adjust.methods = "fdr", + residual_correlation = NULL, coverage = 0.95, - between_cluster = 0.8, + min_cluster_corr = 0.8, dedup = TRUE, overlap = TRUE, n_purity = 100, min_abs_corr = 0.5, median_abs_corr = NULL, - between_purity = 0.8, + median_cos_abs_corr = 0.8, tol = 1e-09, - merging = TRUE, - coverage_singlew = 0.8, - lambda = 0.5, - lambda_target = 1, - func_intw = "fun_R", - alpha = 1.5, - func_prior = "LD_z2z", - func_multicorrection = "lfdr", - target_idx = NULL, - target_variables = TRUE, - overlap_variables = FALSE, - stop_null = 1, - multicorrection_max = 1, - multicorrection_cut = 1, - ash_prior = "normal", - p.adjust.methods = "fdr", + 
merge_cos = TRUE, + sec_coverage_thresh = 0.8, + weight_fudge_factor = 1.5, check_null = 0.1, check_null_method = "profile", check_null_max = 0.02, - residual_correlation = NULL, - LD_obj = FALSE, - dynamic_step = TRUE, - weaker_ucos = TRUE, + weaker_effect = TRUE, + LD_free = FALSE, output_level = 1 ) } @@ -87,40 +86,67 @@ The innovation: do not provide the same matrix in \code{LD} to reduce the comput \item{outcome_names}{The names of outcomes, which has the same order for Y.} +\item{target_outcome_idx}{The index of the target outcome if perform GWAS-xQTL ColocBoost} + +\item{target_outcome_variables}{If \code{target_outcome_variables = TRUE}, only consider the variables exist in the target outcome.} + +\item{overlap_variables}{If \code{overlap_variables = TRUE}, only perform colocalization in the overlapped region.} + +\item{intercept}{If \code{intercept = TRUE}, the intercept is fitted. Setting \code{intercept = FALSE} is generally not recommended.} + +\item{standardize}{If \code{standardize = TRUE}, standardize the columns of genotype and outcomes to unit variance.} + \item{effect_est}{Matrix of variable regression coefficients (i.e. regression beta values) in the genomic region} \item{effect_se}{Matrix of standard errors associated with the beta values} -\item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommendated!} +\item{effect_n}{A scalar or a vector of sample sizes for estimating regression coefficients. Highly recommended!} -\item{M}{The maximum number of gradient boosting iterations. If the number of outcomes are large, it will be automatically increased to a larger number.} +\item{M}{The maximum number of gradient boosting rounds. 
If the number of outcomes are large, it will be automatically increased to a larger number.} \item{stop_thresh}{The stop criterion for overall profile loglikelihood function.} -\item{step}{The minimum step size (learning rate) for updating in each iteration.} +\item{tau}{The smooth parameter for proximity adaptive smoothing weights for the best update jk-star.} -\item{decayrate}{The decayrate for step size. If the objective function is large at the early iterations, -we need to have the higher step size to improve the computational efficiency.} +\item{learning_rate_init}{The minimum learning rate for updating in each iteration.} -\item{tau}{The smooth parameter for proximity adaptive smoothing weights for the best update jk-star.} +\item{learning_rate_decay}{The decayrate for learning rate. If the objective function is large at the early iterations, +we need to have the higher learning rate to improve the computational efficiency.} \item{prioritize_jkstar}{When \code{prioritize_jkstar = TRUE}, the selected outcomes will prioritize best update j_k^star in SEC.} \item{func_compare}{The criterion when we update jk-star in SEC (default is "min_max").} -\item{jk_equiv_cor}{The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome} +\item{jk_equiv_corr}{The LD cutoff between overall best update jk-star and marginal best update jk-l for lth outcome} \item{jk_equiv_loglik}{The change of loglikelihood cutoff between overall best update jk-star and marginal best update jk-l for lth outcome} -\item{coloc_thres}{The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC).} +\item{coloc_thresh}{The cutoff of checking if the best update jk-star is the potential causal variable for outcome l if jk-l is not similar to jk-star (used in Delayed SEC).} -\item{intercept}{If \code{intercept = TRUE}, the intercept is fitted. 
Setting \code{intercept = FALSE} is generally not recommended.} +\item{lambda}{The ratio \link{0,1} for z^2 and z in fun_prior simplex, default is 0.5} -\item{standardize}{If \code{standardize = TRUE}, standardize the columns of genotype and outcomes to unit variance.} +\item{lambda_target_outcome}{The ratio for z^2 and z in fun_prior simplex for the target outcome, default is 1} + +\item{func_simplex}{The data-driven local association simplex \eqn{\delta} for smoothing the weights. Default is "LD_z2z" is the elastic net for z-score and also weighted by LD.} + +\item{func_multi_test}{The alternative method to check the stop criteria. When \code{func_multi_test = "lfdr"}, boosting iterations will be stopped +if the local FDR for all variables are greater than \code{lfsr_max}.} + +\item{stop_null}{The cutoff of nominal p-value when \code{func_multi_test = "Z"}.} + +\item{multi_test_max}{The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} + +\item{multi_test_thresh}{The cutoff of the smallest FDR for stop criteria when \code{func_multi_test = "lfdr"} or \code{func_multi_test = "lfsr"}.} + +\item{ash_prior}{The prior distribution for calculating lfsr when \code{func_multi_test = "lfsr"}.} + +\item{p.adjust.methods}{The adjusted pvalue method in stats:p.adj when \code{func_multi_test = "fdr"}} + +\item{residual_correlation}{The residual correlation based on the sample overlap, it is diagonal if it is NULL.} \item{coverage}{A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95).} -\item{between_cluster}{The small correlation for the weights distributions across different iterations to be decided having only one cluster.} +\item{min_cluster_corr}{The small correlation for the weights distributions across different iterations to be decided having only one cluster.} \item{dedup}{If \code{dedup = TRUE}, the duplicate 
confidence sets will be removed in the post-processing.} @@ -136,43 +162,16 @@ which is a commonly used threshold for genotype data in genetic studies.} threshold will be filtered out and not reported. When both min_abs_corr and median_abs_corr are set, a CS will only be removed if it fails both filters. Default set to NULL but it is recommended to set it to 0.8 in practice.} -\item{between_purity}{Minimum absolute correlation allowed to merge multiple colocalized sets. The default is 0.8 corresponding to a stringent threshold +\item{median_cos_abs_corr}{Median absolute correlation between variants allowed to merge multiple colocalized sets. The default is 0.8 corresponding to a stringent threshold to merge colocalized sets, which may resulting in a huge set.} \item{tol}{A small, non-negative number specifying the convergence tolerance for checking the overlap of the variables in different sets.} -\item{merging}{When \code{merging = TRUE}, the sets for only one outcome will be merged if passed the \code{between_purity}.} - -\item{coverage_singlew}{A number between 0 and 1 specifying the weight in each SEC (default is 0.8).} - -\item{lambda}{The ratio \link{0,1} for z^2 and z in fun_prior simplex, defult is 0.5} +\item{merge_cos}{When \code{merge_cos = TRUE}, the sets for only one outcome will be merged if passed the \code{median_cos_abs_corr}.} -\item{lambda_target}{The ratio for z^2 and z in fun_prior simplex for the target outcome, default is 1} +\item{sec_coverage_thresh}{A number between 0 and 1 specifying the weight in each SEC (default is 0.8).} -\item{func_intw}{The integrated weight method. The default is "fun_R", indicating the same log-scale for different colocalized outcomes.} - -\item{alpha}{The strenght to integrate weight from differnt outcomes, default is 1.5} - -\item{func_prior}{The data-drive local association simplex \eqn{\delta} for smoothing the weights. 
Default is "LD_z2z" is the elastic net for z-score and also weighted by LD.} - -\item{func_multicorrection}{The alternative method to check the stop criteria. When \code{func_multicorrection = "lfdr"}, boosting iterations will be stopped -if the local FDR for all variables are greater than \code{lfsr_max}.} - -\item{target_idx}{The index of the target outcome if perform targeted ColocBoost} - -\item{target_variables}{If \code{target_variables = TRUE}, only consider the variables exsit in the target outcome.} - -\item{overlap_variables}{If \code{overlap_variables = TRUE}, only perform colocalization in the overlapped region.} - -\item{stop_null}{The cutoff of nominal p-value when \code{func_multicorrection = "Z"}.} - -\item{multicorrection_max}{The cutoff of the smallest FDR for pre-filtering the outcomes when \code{func_multicorrection = "lfdr"} or \code{func_multicorrection = "lfsr"}.} - -\item{multicorrection_cut}{The cutoff of the smallest FDR for stop criteria when \code{func_multicorrection = "lfdr"} or \code{func_multicorrection = "lfsr"}.} - -\item{ash_prior}{The prior distribution for calculating lfsr when \code{func_multicorrection = "lfsr"}.} - -\item{p.adjust.methods}{The adjusted pvalue method in stats:p.adj when \code{func_multicorrection = "fdr"}} +\item{weight_fudge_factor}{The strength to integrate weight from different outcomes, default is 1.5} \item{check_null}{The cut off value for change conditional objective function. 
Default is 0.1.} @@ -180,11 +179,9 @@ if the local FDR for all variables are greater than \code{lfsr_max}.} \item{check_null_max}{The smallest value of change of profile loglikelihood for each outcome.} -\item{residual_correlation}{The residual correlation based on the sample overlap, it is diagonal if it is NULL.} +\item{weaker_effect}{If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects} -\item{LD_obj}{When \code{LD_obj = FALSE}, objective fuunction doesn't include LD information.} - -\item{weaker_ucos}{If \code{weaker_ucos = TRUE}, consider the weaker single effect due to coupling effects} +\item{LD_free}{When \code{LD_free = FALSE}, objective function doesn't include LD information.} \item{output_level}{When \code{output_level = 2}, return the ucos details for the single specific effects. When \code{output_level = 3}, return the entire Colocboost model to diagnostic results (more space).} @@ -193,9 +190,10 @@ When \code{output_level = 3}, return the entire Colocboost model to diagnostic r A \code{"colocboost"} object with some or all of the following elements: \item{cos_summary}{A summary table for colocalization events.} -\item{cos_details}{A object with all information for colocalization results.} \item{vcp}{The variable colocalized probability for each variable.} +\item{cos_details}{A object with all information for colocalization results.} \item{data_info}{A object with detailed information from input data} +\item{model_info}{A object with detailed information for colocboost model} } \description{ \code{colocboost} implements a proximity adaptive smoothing gradient boosting approach for multi-trait colocalization at gene loci, @@ -218,7 +216,7 @@ that may arise from small sample sizes or discrepancies in minor allele frequenc NA } -\section{Post-processing Parameters}{ +\section{Post Inference Parameters}{ NA } diff --git a/man/colocboost_assemble.Rd b/man/colocboost_assemble.Rd index bee2625..1a6210f 100644 --- 
a/man/colocboost_assemble.Rd +++ b/man/colocboost_assemble.Rd @@ -7,8 +7,7 @@ colocboost_assemble( cb_obj, coverage = 0.95, - func_intw = "fun_R", - alpha = 1.5, + weight_fudge_factor = 1.5, check_null = 0.1, check_null_method = "profile", check_null_max = 2e-05, @@ -16,12 +15,12 @@ colocboost_assemble( overlap = TRUE, n_purity = 100, min_abs_corr = 0.5, - coverage_singlew = 0.8, + sec_coverage_thresh = 0.8, median_abs_corr = NULL, - between_cluster = 0.8, - between_purity = 0.8, - weaker_ucos = TRUE, - merging = TRUE, + min_cluster_corr = 0.8, + median_cos_abs_corr = 0.8, + weaker_effect = TRUE, + merge_cos = TRUE, tol = 1e-09, output_level = 1 ) @@ -36,11 +35,11 @@ Colocalization signal - \code{colocboost_assemble_cos} - identify the colocalize Un-colocalization signal - \code{colocboost_assemble_ucos} - identify the causal confidence sets for each outcome only. -Add-hoc merging functions including +Add-hoc merge_cos functions including \itemize{ -\item{merge_coloc_single}{merge the colocalized sets and the single causal set if pass the \code{between_purity}} -\item{merge_single}{merge the single causal sets for different outcomes if pass the \code{between_purity}} +\item{merge_coloc_single}{merge the colocalized sets and the single causal set if pass the \code{median_cos_abs_corr}} +\item{merge_single}{merge the single causal sets for different outcomes if pass the \code{median_cos_abs_corr}} } Refine of the colocalization sets (TO-DO-LIST) diff --git a/man/colocboost_check_update_jk.Rd b/man/colocboost_check_update_jk.Rd index 71ded79..5c61363 100644 --- a/man/colocboost_check_update_jk.Rd +++ b/man/colocboost_check_update_jk.Rd @@ -9,10 +9,10 @@ colocboost_check_update_jk( cb_model_para, cb_data, prioritize_jkstar = TRUE, - jk_equiv_cor = 0.8, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, func_compare = "min_max", - coloc_thres = 0.1 + coloc_thresh = 0.1 ) } \value{ diff --git a/man/colocboost_get_methods.Rd b/man/colocboost_get_methods.Rd deleted file mode 
100644 index 29ae5dc..0000000 --- a/man/colocboost_get_methods.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/colocboost_output.R -\name{colocboost_get_methods} -\alias{colocboost_get_methods} -\title{Set of functions for post inferences from ColocBoost} -\usage{ -colocboost_get_methods() -} -\description{ -The \code{colocboost_get_methods} functions access basic properties inferences from a fitted ColocBoost model. This documentation serves as a summary for all related post-inference functions. -} -\details{ -The following functions are included in this set: -\code{get_summary_table} get the colocalization summary table with or without the specific outcomes. - -These functions are not exported individually and are accessed via \code{colocboost_get_methods}. -} -\keyword{cb_post_inference} diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd new file mode 100644 index 0000000..3dc5817 --- /dev/null +++ b/man/colocboost_plot.Rd @@ -0,0 +1,102 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/colocboost_plot.R +\name{colocboost_plot} +\alias{colocboost_plot} +\title{Plot visualization plot from a ColocBoost output.} +\usage{ +colocboost_plot( + cb_output, + y = "log10p", + pos = NULL, + plot_target_only = FALSE, + plot_cos_idx = NULL, + outcome_idx = NULL, + points_color = "grey80", + cos_color = NULL, + add_vertical = FALSE, + add_vertical_idx = NULL, + outcome_names = NULL, + plot_cols = 2, + variant_coord = FALSE, + show_hits = FALSE, + show_cos_to_uncoloc = FALSE, + show_cos_to_uncoloc_idx = NULL, + show_cos_to_uncoloc_outcome = NULL, + gene_name = NULL, + ylim_each = TRUE, + outcome_legend_pos = "top", + outcome_legend_size = 1.2, + cos_legend_pos = "bottomleft", + show_variable = FALSE, + lab_style = c(2, 1), + axis_style = c(2, 1), + title_style = c(2.5, 2), + ... 
+) +} +\arguments{ +\item{y}{Specifies the y-axis values, default is "log10p" for -log10 transformed marginal association p-values.} + +\item{pos}{Optional plotting range of x-axis to zoom in to a specific region.} + +\item{plot_target_only}{Logical, if TRUE only plots colocalization with target outcome, default is FALSE.} + +\item{plot_cos_idx}{Optional indices of CoS to plot} + +\item{outcome_idx}{Optional indices of outcomes to include in the plot. \code{outcome_idx=NULL} to plot only the outcomes having colocalization.} + +\item{points_color}{Background color for non-colocalized variables, default is "grey80".} + +\item{cos_color}{Optional custom colors for CoS.} + +\item{add_vertical}{Logical, if TRUE adds vertical lines at specified positions, default is FALSE} + +\item{add_vertical_idx}{Optional indices for vertical lines.} + +\item{outcome_names}{Optional vector of outcomes names for the subtitle of each figure. \code{outcome_names=NULL} for the outcome name shown in \code{data_info}.} + +\item{plot_cols}{Number of columns in the plot grid, default is 2. If you have many colocalization. please consider increasing this.} + +\item{variant_coord}{Logical, if TRUE uses variant coordinates on x-axis, default is FALSE. 
This is required the variable names including position information.} + +\item{show_hits}{Logical, if TRUE shows top variables for each CoS, default is FALSE} + +\item{show_cos_to_uncoloc}{Logical, if TRUE shows colocalization to uncolocalized outcomes to diagnose, default is FALSE} + +\item{show_cos_to_uncoloc_idx}{Optional indices for showing CoS to all uncolocalized outcomes} + +\item{show_cos_to_uncoloc_outcome}{Optional outcomes for showing CoS to uncolocalized outcomes} + +\item{gene_name}{Optional gene name to display in plot title} + +\item{ylim_each}{Logical, if TRUE uses separate y-axis limits for each plot, default is TRUE} + +\item{outcome_legend_pos}{Position for outcome legend, default is "top"} + +\item{outcome_legend_size}{Size for outcome legend text, default is 1.2} + +\item{cos_legend_pos}{Position for colocalization set legend, default is "bottomleft"} + +\item{show_variable}{Logical, if TRUE displays variant IDs, default is FALSE} + +\item{lab_style}{Vector of two numbers for label style (size, boldness), default is c(2, 1)} + +\item{axis_style}{Vector of two numbers for axis style (size, boldness), default is c(2, 1)} + +\item{title_style}{Vector of two numbers for title style (size, boldness), default is c(2.5, 2)} + +\item{...}{Additional parameters passed to \code{plot} functions} + +\item{Output}{object from \code{colocboost} analysis} +} +\value{ +Visualization plot for each colcoalization event. +} +\description{ +\code{colocboost_plot} generates visualization plots for colocalization events from a ColocBoost analysis. 
+} +\examples{ +colocboost_plot(cb_output) + +} +\keyword{cb_plot} diff --git a/man/colocboost_post_inference.Rd b/man/colocboost_post_inference.Rd new file mode 100644 index 0000000..d1f72c1 --- /dev/null +++ b/man/colocboost_post_inference.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/colocboost_inference.R +\name{colocboost_post_inference} +\alias{colocboost_post_inference} +\title{Set of functions for post inferences from ColocBoost} +\usage{ +colocboost_post_inference() +} +\description{ +The \code{colocboost_post_inference} functions access basic properties inferences from a fitted ColocBoost model. This documentation serves as a summary for all related post-inference functions. +} +\details{ +The following functions are included in this set: +\code{get_abc} get the colocalization summary table with or without the specific outcomes. + +These functions are not exported individually and are accessed via \code{colocboost_post_inference}. 
+} +\keyword{cb_post_inference} diff --git a/man/colocboost_update.Rd b/man/colocboost_update.Rd index 22cb5cd..8ec5624 100644 --- a/man/colocboost_update.Rd +++ b/man/colocboost_update.Rd @@ -9,12 +9,12 @@ colocboost_update( cb_model_para, cb_data, tau = 0.01, - decayrate = 1, - func_prior = "z2z", + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - LD_obj = FALSE, - dynamic_step = TRUE + lambda_target_outcome = 1, + LD_free = FALSE, + dynamic_learning_rate = TRUE ) } \value{ diff --git a/man/colocboost_workhorse.Rd b/man/colocboost_workhorse.Rd index 951e83f..71e5ef1 100644 --- a/man/colocboost_workhorse.Rd +++ b/man/colocboost_workhorse.Rd @@ -7,27 +7,27 @@ colocboost_workhorse( cb_data, M = NULL, - step = 0.01, tau = 0.01, prioritize_jkstar = TRUE, - decayrate = 1, - func_prior = "z2z", + learning_rate_init = 0.01, + learning_rate_decay = 1, + func_simplex = "z2z", lambda = 0.5, - lambda_target = 1, - jk_equiv_cor = 0.8, + lambda_target_outcome = 1, + jk_equiv_corr = 0.8, jk_equiv_loglik = 1, stop_thresh = 1e-06, - func_multicorrection = "lfdr", + func_multi_test = "lfdr", stop_null = 0.05, - multicorrection_max = 1, - multicorrection_cut = 1, + multi_test_max = 1, + multi_test_thresh = 1, ash_prior = "normal", p.adjust.methods = "fdr", func_compare = "min_max", - coloc_thres = 0.1, - LD_obj = FALSE, - dynamic_step = TRUE, - target_idx = NULL, + coloc_thresh = 0.1, + LD_free = FALSE, + dynamic_learning_rate = TRUE, + target_outcome_idx = NULL, outcome_names = NULL ) } diff --git a/man/get_cos_summary.Rd b/man/get_cos_summary.Rd new file mode 100644 index 0000000..e6f84a5 --- /dev/null +++ b/man/get_cos_summary.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/colocboost_output.R +\name{get_cos_summary} +\alias{get_cos_summary} +\title{Get colocalization summary table from a ColocBoost output.} +\usage{ +get_cos_summary( + cb_output, + outcome_names = NULL, + interest_outcome = 
NULL, + gene_name = NULL +) +} +\arguments{ +\item{cb_output}{Output object from \code{colocboost} analysis} + +\item{outcome_names}{Optional vector of names of outcomes, which has the same order as Y in the original analysis.} + +\item{interest_outcome}{Optional vector specifying a subset of outcomes from \code{outcome_names} to focus on. When provided, only colocalization events that include at least one of these outcomes will be returned.} + +\item{gene_name}{Optional character string. When provided, adds a column with this gene name to the output table for easier filtering in downstream analyses.} +} +\value{ +A summary table for colocalization events with the following columns: +\item{target_outcome}{The target outcome being analyzed if exists. Otherwise, it is \code{FALSE}.} +\item{colocalized_outcomes}{Colocalized outcomes for colocalization confidence set (CoS) } +\item{cos_id}{Unique identifier for colocalization confidence set (CoS) } +\item{purity}{Minimum absolute correlation of variables with in colocalization confidence set (CoS) } +\item{top_variable}{The variable with highest variant colocalization probability (VCP) } +\item{top_variable_vcp}{Variant colocalization probability for the top variable} +\item{cos_npc}{Normalized probability of colocalization} +\item{min_npc_outcome}{Minimum normalized probability of colocalized traits} +\item{n_variables}{Number of variables in colocalization confidence set (CoS)} +\item{colocalized_index}{Indices of colocalized variables} +\item{colocalized_variables}{List of colocalized variables} +\item{colocalized_variables_vcp}{Variant colocalization probabilities for all colocalized variables} +} +\description{ +\code{get_cos_summary} get the colocalization summary table with or without the outcomes of interest. 
+} +\examples{ +get_cos_summary(cb_output) +get_cos_summary(cb_output, interest_outcome = c("Y1", "Y2")) + +} +\keyword{cb_get_functions} diff --git a/man/get_strong_colocalization.Rd b/man/get_strong_colocalization.Rd new file mode 100644 index 0000000..68a483d --- /dev/null +++ b/man/get_strong_colocalization.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/colocboost_output.R +\name{get_strong_colocalization} +\alias{get_strong_colocalization} +\title{Get colocalization summary table from a ColocBoost output.} +\usage{ +get_strong_colocalization( + cb_output, + cos_npc_cutoff = 0.5, + npc_outcome_cutoff = 0.25, + pvalue_cutoff = NULL, + weight_fudge_factor = 1.5, + coverage = 0.95 +) +} +\arguments{ +\item{cb_output}{Output object from \code{colocboost} analysis} + +\item{cos_npc_cutoff}{Minimum threshold of normalized probability of colocalization (NPC) for CoS.} + +\item{npc_outcome_cutoff}{Minimum threshold of normalized probability of colocalized traits in each CoS.} + +\item{pvalue_cutoff}{Maximum threshold of marginal p-values of colocalized variants on colocalized traits in each CoS.} + +\item{weight_fudge_factor}{The strength to integrate weight from different outcomes, default is 1.5} + +\item{coverage}{A number between 0 and 1 specifying the \dQuote{coverage} of the estimated colocalization confidence sets (CoS) (default is 0.95).} +} +\value{ +A \code{"colocboost"} object with some or all of the following elements: + +\item{cos_summary}{A summary table for colocalization events.} +\item{vcp}{The variable colocalized probability for each variable.} +\item{cos_details}{A object with all information for colocalization results.} +\item{data_info}{A object with detailed information from input data} +\item{model_info}{A object with detailed information for colocboost model} +} +\description{ +\code{get_strong_colocalization} get the colocalization by discarding the weaker colocalization events or colocalized 
outcomes +} +\examples{ +get_strong_colocalization(cb_output, cos_npc_cutoff = 0.5, npc_outcome_cutoff = 0.25) + +} +\keyword{cb_get_functions}