diff --git a/R/colocboost.R b/R/colocboost.R index 4c5f651..81e8490 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -82,7 +82,8 @@ #' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 #' @param check_null The cut off value for change conditional objective function. Default is 0.1. #' @param check_null_method The metric to check the null sets. Default is "profile" -#' @param check_null_max The smallest value of change of profile loglikelihood for each outcome. +#' @param check_null_max The smallest value of change of profile loglikelihood for each outcome in CoS. +#' @param check_null_max_ucos The smallest value of change of profile loglikelihood for each outcome in uCoS. #' @param weaker_effect If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects #' @param LD_free When \code{LD_free = FALSE}, objective function doesn't include LD information. #' @param output_level When \code{output_level = 1}, return basic cos details for colocalization results @@ -180,6 +181,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data check_null = 0.1, # the cut off value for change conditional objective function check_null_method = "profile", # the metric to check the null sets. check_null_max = 0.025, # the smallest value of change of profile loglikelihood for each outcome. + check_null_max_ucos = 0.015, # the smallest value of change of profile loglikelihood for each outcome in uCoS. weaker_effect = TRUE, LD_free = FALSE, output_level = 1) { @@ -398,23 +400,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ", "Additionally, the purity of CoS cannot be evaluated!" 
) - - p.sumstat <- sapply(keep_variable_sumstat, length) - p.unique <- unique(p.sumstat) - if (length(p.unique) == 1) { - ld <- diag(1, nrow = p.unique) - colnames(ld) <- rownames(ld) <- keep_variable_sumstat[[1]] - LD <- list(ld) - sumstatLD_dict <- rep(1, length(sumstat)) - } else { - LD <- lapply(keep_variable_sumstat, function(sn) { - ld <- diag(1, nrow = length(sn)) - colnames(ld) <- rownames(ld) <- sn - return(ld) - }) - sumstatLD_dict <- 1:length(sumstat) - } - + + LD <- 1 + sumstatLD_dict <- rep(1, length(sumstat)) # change some algorithm parameters M <- 1 # one iteration min_abs_corr <- 0 # remove purity checking @@ -428,6 +416,12 @@ colocboost <- function(X = NULL, Y = NULL, # individual data if (is.matrix(LD)) { LD <- list(LD) } + # - check if NA in LD matrix + num_na <- sapply(LD, sum) + if (any(is.na(num_na))){ + warning("Error: Input LD must not contain missing values (NA).") + return(NULL) + } if (length(LD) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) } else if (length(LD) == length(sumstat)) { @@ -616,6 +610,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data check_null = check_null, check_null_method = check_null_method, check_null_max = check_null_max, + check_null_max_ucos = check_null_max_ucos, dedup = dedup, overlap = overlap, n_purity = n_purity, @@ -632,3 +627,4 @@ colocboost <- function(X = NULL, Y = NULL, # individual data class(cb_output) <- "colocboost" return(cb_output) } + diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 9ccf7cd..aba5405 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -25,6 +25,7 @@ colocboost_assemble <- function(cb_obj, check_null = 0.1, check_null_method = "profile", check_null_max = 0.025, + check_null_max_ucos = 0.015, dedup = TRUE, overlap = TRUE, n_purity = 100, @@ -71,11 +72,17 @@ colocboost_assemble <- function(cb_obj, } } } else { - if (cb_obj$cb_model_para$M == 1) { + if (cb_obj$cb_model_para$model_used == "LD_free") { + # fixme later 
check_null_method <- "obj" - check_null <- check_null_max <- 0.1 + check_null_max <- check_null * check_null + } else if (cb_obj$cb_model_para$model_used == "one_causal"){ + # fixme later + check_null_max <- check_null_max * check_null } - cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_method = check_null_method) + cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, + check_null_max_ucos = check_null_max_ucos, + check_null_method = check_null_method) # --------- about colocalized confidence sets --------------------------------- out_cos <- colocboost_assemble_cos(cb_obj, coverage = coverage, @@ -202,10 +209,6 @@ colocboost_assemble <- function(cb_obj, # - colocalization results cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor cb_obj$cb_model_para$coverage <- coverage - if (cb_obj$cb_model_para$M == 1) { - # fixme for single iteration model - cb_obj <- get_max_profile(cb_obj, check_null_max = 0.01, check_null_method = "profile") - } cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info) cb_output <- list( "vcp" = cos_results$vcp, diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index 9e78c14..9fbcdb2 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -498,10 +498,13 @@ get_LD_jk1_jk2 <- function(jk1, jk2, LD_temp[which(is.na(LD_temp))] <- 0 LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - jk1.remain <- which(remain_jk == jk1) - jk2.remain <- which(remain_jk == jk2) - # scaling <- if (!is.null(N)) N-1 else 1 - LD_temp <- XtX[jk1.remain, jk2.remain] # / scaling + if (sum(XtX) == 1){ + LD_temp <- 0 + } else { + jk1.remain <- which(remain_jk == jk1) + jk2.remain <- which(remain_jk == jk2) + LD_temp <- XtX[jk1.remain, jk2.remain] + } } return(LD_temp) } @@ -595,7 +598,11 @@ estimate_change_profile_res <- function(jk, } else { xty <- XtY / scaling_factor } - rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% 
as.matrix(beta_k)) * beta_k) + if (sum(xtx) == 1){ + rtr <- yty - 2 * sum(beta_k * xty) + sum(beta_k^2) + } else { + rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) + } } numerator <- xtr^2 / (2 * rtr) denominator <- 0.5 * log(2 * pi * rtr) + 0.5 diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index bc709b5..5226db0 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -207,6 +207,7 @@ w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverag return(is_pure) } + #' Function to remove the spurious signals #' @importFrom utils head tail #' @keywords cb_post_inference @@ -240,8 +241,11 @@ check_null_post <- function(cb_obj, } else { xty <- XtY / scaling_factor } - - (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + if (sum(xtx) == 1){ + (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + } else { + (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + } } } @@ -289,10 +293,18 @@ check_null_post <- function(cb_obj, if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + if (sum(xtx) == 1){ + res.tmp[-miss_idx] <- xty - cs_beta[-miss_idx] / beta_scaling + } else { + res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + } } else { xty <- XtY / scaling.factor - res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + if (sum(xtx) == 1){ + res.tmp <- xty - (cs_beta / beta_scaling) + } else { + res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + } } return(res.tmp) } @@ -349,7 +361,7 @@ check_null_post <- function(cb_obj, last_obj <- min(cb_obj$cb_model[[j]]$obj_path) change <- abs(cs_obj - last_obj) if (length(cb_obj$cb_model[[j]]$obj_path) == 1) { - total_obj <- 1 + total_obj <- change } else { total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) } @@ -408,8 +420,12 @@ 
get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { corr[which(is.na(corr))] <- 0 value <- abs(get_upper_tri(corr)) } else { - Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr - value <- abs(get_upper_tri(Xcorr[pos, pos])) + if (sum(Xcorr) == 1){ + value <- 0 + } else { + Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr + value <- abs(get_upper_tri(Xcorr[pos, pos])) + } } return(c( min(value), @@ -443,14 +459,18 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU X_sub2 <- scale(X[, pos2, drop = FALSE], center = T, scale = F) value <- abs(get_matrix_mult(X_sub1, X_sub2)) } else { - if (length(miss_idx)!=0){ - pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) - pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) - } - if (length(pos1) != 0 & length(pos2) != 0) { - value <- abs(Xcorr[pos1, pos2]) - } else { + if (sum(Xcorr)==1){ value <- 0 + } else { + if (length(miss_idx)!=0){ + pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) + pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) + } + if (length(pos1) != 0 & length(pos2) != 0) { + value <- abs(Xcorr[pos1, pos2]) + } else { + value <- 0 + } } } return(c(min(value), max(value), get_median(value))) @@ -488,8 +508,11 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } else { xty <- XtY / scaling_factor } - - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + if (sum(xtx) == 1){ + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + } else { + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + } } delta <- yty - cos_profile if (delta <= 0) { diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 76c6ceb..bd278ab 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -39,22 +39,43 @@ colocboost_init_data <- function(X, Y, dict_YX, ################# initialization 
####################################### cb_data <- list("data" = list()) class(cb_data) <- "colocboost" + # if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { + # dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) + # n_ind_variable <- max(dict_YX) + # } else if (!is.null(dict_YX) & is.null(dict_sumstatLD)) { + # dict <- dict_YX + # } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)) { + # dict <- dict_sumstatLD + # n_ind_variable <- 0 + # } + + # note here and remove later: dict is designed by X and LD, but we need additional dictionary for keep variables. + # keep variables for individual level data is based on X - there is no issue. + # keep variables for sumstat data is based on sumstat (not LD) - there is a issue to index the focal outcome based on dict later. if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) n_ind_variable <- max(dict_YX) + dict_keep_variables <- c(dict_YX, 1:length(dict_sumstatLD) + n_ind_variable) } else if (!is.null(dict_YX) & is.null(dict_sumstatLD)) { dict <- dict_YX + dict_keep_variables <- dict_YX } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- dict_sumstatLD n_ind_variable <- 0 + dict_keep_variables <- 1:length(dict_sumstatLD) } + if (focal_outcome_variables & !is.null(focal_outcome_idx)) { if (focal_outcome_idx > length(dict)) { stop("Target outcome index is over the total number of outcomes! 
please check!") } - keep_variable_names <- keep_variables[[dict[focal_outcome_idx]]] + # keep_variable_names <- keep_variables[[dict[focal_outcome_idx]]] + keep_variable_names <- keep_variables[[dict_keep_variables[focal_outcome_idx]]] if (overlap_variables) { - keep_tmp <- lapply(keep_variables[-dict[focal_outcome_idx]], function(tm) { + # keep_tmp <- lapply(keep_variables[-dict[focal_outcome_idx]], function(tm) { + # intersect(keep_variable_names, tm) + # }) + keep_tmp <- lapply(keep_variables[-dict_keep_variables[focal_outcome_idx]], function(tm) { intersect(keep_variable_names, tm) }) keep_variable_names <- Reduce(union, keep_tmp) @@ -299,7 +320,8 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, "num_updates_outcome" = num_updates_outcome, "func_multi_test" = func_multi_test, "multi_test_thresh" = multi_test_thresh, - "multi_test_max" = multi_test_max + "multi_test_max" = multi_test_max, + "model_used" = "original" ) class(cb_model_para) <- "colocboost" @@ -374,7 +396,11 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, Xtr <- res / scaling_factor XtY <- XtY / scaling_factor } - var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + if (sum(XtX) == 1){ + var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) + } else { + var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + } if (var_r > 1e-6) { corr_nomiss <- Xtr / sqrt(var_r) if (length(miss_idx) != 0) { @@ -425,15 +451,35 @@ get_lfdr <- function(z, miss_idx = NULL) { z <- z[-miss_idx] try_run <- 1 while (try_run == 1 && lambda_max >= 0.05) { - result <- try( + # result <- try( + # { + # lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), lambda = seq(0.05, lambda_max, 0.05))$lfdr + # }, + # silent = TRUE + # ) + # if (inherits(result, "try-error")) { + # lambda_max <- lambda_max - 0.05 # Decrement lambda_max if error occurs + # } else { + # try_run <- 0 + # } + result <- tryCatch( { - 
lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), lambda = seq(0.05, lambda_max, 0.05))$lfdr + lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), + lambda = seq(0.05, lambda_max, 0.05))$lfdr + list(success = TRUE, value = lfdr_nomissing) }, - silent = TRUE + warning = function(w) { + list(success = FALSE, message = w$message) + }, + error = function(e) { + list(success = FALSE, message = e$message) + } ) - if (inherits(result, "try-error")) { - lambda_max <- lambda_max - 0.05 # Decrement lambda_max if error occurs + + if (!result$success) { + lambda_max <- lambda_max - 0.05 } else { + lfdr_nomissing <- result$value try_run <- 0 } } @@ -573,21 +619,24 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - ld_submatrix <- NULL - - if (length(common_variants) > 0) { - # Only include the submatrix if this entry is unique or is the first occurrence - if (i == unified_dict[i]) { - # Check if common_variants and rownames have identical order - if (identical(common_variants, rownames(current_ld_matrix))) { - # If order is identical, use the matrix directly without reordering - ld_submatrix <- current_ld_matrix - } else { - # If order is different, reorder using matched indices - matched_indices <- match(common_variants, rownames(current_ld_matrix)) - ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] - rownames(ld_submatrix) <- common_variants - colnames(ld_submatrix) <- common_variants + if (sum(current_ld_matrix) == 1){ + ld_submatrix <- current_ld_matrix + } else { + ld_submatrix <- NULL + if (length(common_variants) > 0) { + # Only include the submatrix if this entry is unique or is the first occurrence + if (i == unified_dict[i]) { + # Check if common_variants and rownames have identical order + if (identical(common_variants, rownames(current_ld_matrix))) { + # If order is 
identical, use the matrix directly without reordering + ld_submatrix <- current_ld_matrix + } else { + # If order is different, reorder using matched indices + matched_indices <- match(common_variants, rownames(current_ld_matrix)) + ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] + rownames(ld_submatrix) <- common_variants + colnames(ld_submatrix) <- common_variants + } } } } diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index 0d70f1d..0ed96b5 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -17,9 +17,10 @@ colocboost_one_causal <- function(cb_model, cb_model_para, cb_data) { if (cb_model_para$jk_equiv_corr != 0) { cb_obj <- colocboost_one_iteration(cb_model, cb_model_para, cb_data) - + cb_obj$cb_model_para$model_used <- "one_causal" } else { cb_obj <- colocboost_diagLD(cb_model, cb_model_para, cb_data) + cb_obj$cb_model_para$model_used <- "LD_free" } return(cb_obj) } diff --git a/R/colocboost_output.R b/R/colocboost_output.R index d22b6e0..87617e8 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -838,7 +838,7 @@ get_ucos_summary <- function(cb_output, outcome_names = NULL, region_name = NULL "overlap_idx", "overlap_variables", "n_recalibrated_variables", "recalibrated_index", "recalibrated_variables", "recalibrated_variables_vcp" ) - + ambiguous_summary <- as.data.frame(ambiguous_summary) ambiguous_summary[, 1] <- unlist(sapply(ambiguous_results, function(tmp) paste0(tmp$ambiguous_cos_outcomes$outcome_name, collapse = "; "))) ambiguous_summary[, 2] <- names(ambiguous_results) ambiguous_summary[, 3] <- sapply(ambiguous_results, function(tmp) max(tmp$ambigous_cos_purity$min_abs_cor[lower.tri(tmp$ambigous_cos_purity$min_abs_cor)]) ) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 9381837..90141d6 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -29,7 +29,7 @@ #' @param ylim_each Logical, if TRUE uses separate y-axis limits for each plot, default 
is TRUE #' @param outcome_legend_pos Position for outcome legend, default is "top" #' @param outcome_legend_size Size for outcome legend text, default is 1.2 -#' @param cos_legend_pos Proportion of the legend from (left edge, bottom edge), default as (0.05, 0.4) at the left - median position +#' @param cos_legend_pos Proportion of the legend from (left edge, bottom edge) following by the distance for multiple labels (horizontal-x space, vertical-y space), default as (0.05, 0.4, 0.1, 0.5) at the left - median position #' @param show_variable Logical, if TRUE displays variant IDs, default is FALSE #' @param lab_style Vector of two numbers for label style (size, boldness), default is c(2, 1) #' @param axis_style Vector of two numbers for axis style (size, boldness), default is c(2, 1) @@ -93,7 +93,7 @@ colocboost_plot <- function(cb_output, y = "log10p", ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, - cos_legend_pos = c(0.05, 0.4), + cos_legend_pos = c(0.05, 0.4, 0.1, 0.5), show_variable = FALSE, lab_style = c(2, 1), axis_style = c(2, 1), @@ -301,7 +301,7 @@ colocboost_plot <- function(cb_output, y = "log10p", y_pos <- usr[3] + cb_plot_init$cos_legend_pos[2] * (usr[4] - usr[3]) # 50% from bottom edge legend(x = x_pos, y = y_pos, texts, bty = "n", col = shape_col, text.col = texts_col, - cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = 0.1, y.intersp = 0.5 + cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4] ) } } @@ -638,19 +638,55 @@ plot_initial <- function(cb_plot_input, y = "log10p", # - set background point color and cos color pools args$bg <- points_color if (is.null(cos_color)) { - # cos_color <- c("dodgerblue2", "#6A3D9A", "#FF7F00", "#FB9A99", "#33A02C", - # "#A6CEE3", "gold1", "#01665E","#FDBF6F", "#CAB2D6", "#B2DF8A", - # "#8C510A", "#BF812D", "#DFC27D", "#F6E8C3", "#01665E", - # "#35978F", "#80CDC1", "#C7EAE5", "#003C30") - # cos_color <- c("#1B9E77", "#D95F02", "#7570B3", "#1F78B4", 
"#66A61E", - # "#E6AB02", "#A6761D", "#666666", "#E7298A", "#B2182B", - # "#D73027", "#4575B4", "#313695", "#542788", "#74ADD1", - # "#F46D43", "#4DAF4A", "#984EA3", "#FF7F00", "#A50F15") + + # cos_color <- c( + # "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43", + # "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF", + # "#4DAF4A", "#E41A1C", "#FF7F00", "#6A3D9A", "#1B7837", + # "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E" + # ) cos_color <- c( + # Original 20 colors (preserved exactly) "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43", "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF", "#4DAF4A", "#E41A1C", "#FF7F00", "#6A3D9A", "#1B7837", - "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E" + "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E", + + # Additional 80 colors - carefully selected to complement the original palette + # Blues and Teals + "#5DADE2", "#85C1E9", "#2E86C1", "#1B4F72", "#154360", + "#AED6F1", "#3498DB", "#21618C", "#1A5490", "#17A2B8", + "#20B2AA", "#008B8B", "#4682B4", "#87CEEB", "#B0E0E6", + + # Greens + "#58D68D", "#82E0AA", "#27AE60", "#1E8449", "#186A3B", + "#ABEBC6", "#2ECC71", "#229954", "#196F3D", "#52C41A", + "#90EE90", "#32CD32", "#228B22", "#006400", "#9ACD32", + + # Reds and Pinks + "#EC7063", "#F1948A", "#CB4335", "#A93226", "#922B21", + "#FADBD8", "#E74C3C", "#C0392B", "#943126", "#FF6B6B", + "#FF69B4", "#FFB6C1", "#FF1493", "#DC143C", "#B22222", + + # Oranges and Yellows + "#F8C471", "#FAD7A0", "#E67E22", "#CA6F1E", "#B7950B", + "#FCF3CF", "#F39C12", "#D68910", "#B7950B", "#FFA500", + "#FFFF00", "#FFD700", "#FFA500", "#FF8C00", "#DAA520", + + # Purples and Violets + "#BB8FCE", "#D2B4DE", "#8E44AD", "#7D3C98", "#6C3483", + "#E8DAEF", "#9B59B6", "#7E57C2", "#673AB7", "#9C27B0", + "#8A2BE2", "#9370DB", "#BA55D3", "#DA70D6", "#DDA0DD", + + # Browns and Earth Tones + "#CD853F", "#D2691E", "#A0522D", "#8B4513", "#654321", + "#DEB887", "#F4A460", "#BC8F8F", "#D2B48C", "#F5DEB3", + "#8B7355", "#A0522D", 
"#CD853F", "#DEB887", "#D2691E", + + # Grays and Neutrals + "#85929E", "#AEB6BF", "#5D6D7E", "#34495E", "#2C3E50", + "#D5D8DC", "#BDC3C7", "#95A5A6", "#7F8C8D", "#17202A", + "#808080", "#A9A9A9", "#C0C0C0", "#696969", "#778899" ) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 21b1bb5..5858a3e 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -111,16 +111,29 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { if (length(cb_data$data[[i]]$variable_miss) != 0) { beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta + if (sum(xtx) == 1){ + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * beta + } else { + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta + } + } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta + if (sum(xtx) == 1){ + cb_model[[i]]$res <- xty - scaling_factor * beta + } else { + cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta + } } # - profile-loglikelihood yty <- cb_data$data[[i]]$YtY / scaling_factor xty <- xty / scaling_factor - profile_log <- (yty - 2 * sum(beta * xty) + sum((xtx %*% as.matrix(beta)) * beta)) * adj_dep + if (sum(xtx) == 1){ + profile_log <- (yty - 2 * sum(beta * xty) + sum(beta^2)) * adj_dep + } else { + profile_log <- (yty - 2 * sum(beta * xty) + sum((xtx %*% as.matrix(beta)) * beta)) * adj_dep + } } cb_model[[i]]$profile_loglike_each <- c(cb_model[[i]]$profile_loglike_each, profile_log) } @@ -137,10 +150,13 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P }) corr[which(is.na(corr))] <- 0 } else if (!is.null(XtX)) { - # scaling <- if (!is.null(N)) N-1 else 1 jk1.remain <- which(remain_idx == jk1) corr <- 
rep(0, P) - corr[remain_idx] <- XtX[, jk1.remain] + if (sum(XtX) == 1 | length(jk1.remain)==0){ + corr[remain_idx] <- 1 + } else { + corr[remain_idx] <- XtX[, jk1.remain] + } } corr } diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index 702590e..c4da011 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -407,14 +407,18 @@ get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weigh } -get_max_profile <- function(cb_obj, check_null_max = 0.02, check_null_method = "profile") { +get_max_profile <- function(cb_obj, check_null_max = 0.025, + check_null_max_ucos = 0.015, + check_null_method = "profile") { for (i in 1:cb_obj$cb_model_para$L) { cb <- cb_obj$cb_model[[i]] scaling_factor <- if (!is.null(cb_obj$cb_data$data[[i]]$N)) cb_obj$cb_data$data[[i]]$N - 1 else 1 if (check_null_method == "profile") { cb$check_null_max <- 1000 * check_null_max / scaling_factor + cb$check_null_max_ucos <- 1000 * check_null_max_ucos / scaling_factor } else { cb$check_null_max <- check_null_max + cb$check_null_max_ucos <- check_null_max_ucos } cb_obj$cb_model[[i]] <- cb } @@ -434,26 +438,49 @@ w_cs <- function(weights, coverage = 0.95) { #' Pure R implementation (fallback) #' @noRd get_merge_ordered_with_indices <- function(vector_list) { - # Quick validation - if (!is.list(vector_list) || length(vector_list) == 0) { - stop("Input must be a non-empty list of vectors") + + # Fallback function for priority-based merging (embedded) + get_merge_ordered_with_priority_fallback <- function(vector_list) { + # Convert all vectors to character + vector_list <- lapply(vector_list, as.character) + + # Start with empty result + result <- character() + processed_elements <- character() + + # Process vectors in priority order (1st, 2nd, 3rd, etc.) 
+ for (i in seq_along(vector_list)) { + current_vec <- vector_list[[i]] + + # Add elements from current vector that haven't been processed yet + new_elements <- setdiff(current_vec, processed_elements) + + if (length(new_elements) > 0) { + # Find positions of new elements in current vector to maintain relative order + new_positions <- which(current_vec %in% new_elements) + ordered_new_elements <- current_vec[new_positions] + + # Add to result + result <- c(result, ordered_new_elements) + processed_elements <- c(processed_elements, ordered_new_elements) + } + } + + return(result) } - + # Convert all vectors to character vector_list <- lapply(vector_list, as.character) n_vectors <- length(vector_list) - # Step 1: Get all unique elements all_elements <- unique(unlist(vector_list)) n_elements <- length(all_elements) - # Step 2: Build a graph of ordering constraints # Use an adjacency list: for each element, store elements that must come after it graph <- new.env(hash = TRUE, parent = emptyenv(), size = n_elements) for (elem in all_elements) { graph[[elem]] <- character() } - # Add edges based on consecutive pairs in each vector for (vec in vector_list) { for (i in seq_len(length(vec) - 1)) { @@ -465,7 +492,6 @@ get_merge_ordered_with_indices <- function(vector_list) { } } } - # Step 3: Compute in-degrees (number of incoming edges for each node) in_degree <- new.env(hash = TRUE, parent = emptyenv(), size = n_elements) for (elem in all_elements) { @@ -476,7 +502,6 @@ get_merge_ordered_with_indices <- function(vector_list) { in_degree[[to_elem]] <- in_degree[[to_elem]] + 1 } } - # Step 4: Topological sort using Kahn's algorithm # Start with nodes that have no incoming edges queue <- all_elements[sapply(all_elements, function(elem) in_degree[[elem]] == 0)] @@ -486,7 +511,6 @@ get_merge_ordered_with_indices <- function(vector_list) { current <- queue[1] queue <- queue[-1] result <- c(result, current) - # Process all neighbors (elements that must come after current) 
neighbors <- graph[[current]] for (next_elem in neighbors) { @@ -496,12 +520,14 @@ get_merge_ordered_with_indices <- function(vector_list) { } } } - - # Step 5: Check for cycles (if result doesn't include all elements, there’s a cycle) + # Step 5: Check for cycles and use fallback if needed if (length(result) != n_elements) { - stop("Cycle detected in ordering constraints; cannot produce a valid merged order") + # Different variable orders detected - use priority-based fallback + warning("Variable names in different datasets have different orders (e.g., dataset1 has A then B, but dataset2 has B then A). ", + "Using priority-based fallback to prioritize the variable order in the earlier datasets.") + result <- get_merge_ordered_with_priority_fallback(vector_list) } - + result } @@ -879,12 +905,12 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output cs_change <- data.frame("ucos_outcome" = change_outcomes, "ucos_delta" = change_values) # - filter weak ucos - check_null_max <- sapply(cb_model, function(cb) cb$check_null_max) + check_null_max_ucos <- sapply(cb_model, function(cb) cb$check_null_max_ucos) remove_weak <- sapply(1:nrow(cs_change), function(ic) { outcome_tmp <- cs_change$ucos_outcome[ic] delta_tmp <- cs_change$ucos_delta[ic] pp <- which(cb_obj$cb_model_para$outcome_names == outcome_tmp) - check_tmp <- check_null_max[pp] + check_tmp <- check_null_max_ucos[pp] delta_tmp >= check_tmp }) keep_ucos <- which(remove_weak) diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 41c1488..cd9fcca 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -59,6 +59,7 @@ colocboost( check_null = 0.1, check_null_method = "profile", check_null_max = 0.025, + check_null_max_ucos = 0.015, weaker_effect = TRUE, LD_free = FALSE, output_level = 1 @@ -182,7 +183,9 @@ to merge colocalized sets, which may resulting in a huge set.} \item{check_null_method}{The metric to check the null sets. 
Default is "profile"} -\item{check_null_max}{The smallest value of change of profile loglikelihood for each outcome.} +\item{check_null_max}{The smallest value of change of profile loglikelihood for each outcome in CoS.} + +\item{check_null_max_ucos}{The smallest value of change of profile loglikelihood for each outcome in uCoS.} \item{weaker_effect}{If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects} diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index 0fdd21c..fe155fc 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -33,7 +33,7 @@ colocboost_plot( ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, - cos_legend_pos = c(0.05, 0.4), + cos_legend_pos = c(0.05, 0.4, 0.1, 0.5), show_variable = FALSE, lab_style = c(2, 1), axis_style = c(2, 1), @@ -92,7 +92,7 @@ colocboost_plot( \item{outcome_legend_size}{Size for outcome legend text, default is 1.2} -\item{cos_legend_pos}{Proportion of the legend from (left edge, bottom edge), default as (0.05, 0.4) at the left - median position} +\item{cos_legend_pos}{Proportion of the legend from (left edge, bottom edge) following by the distance for multiple labels (horizontal-x space, vertical-y space), default as (0.05, 0.4, 0.1, 0.5) at the left - median position} \item{show_variable}{Logical, if TRUE displays variant IDs, default is FALSE} diff --git a/tests/testthat/test_corner_cases.R b/tests/testthat/test_corner_cases.R index d499159..66938a5 100644 --- a/tests/testthat/test_corner_cases.R +++ b/tests/testthat/test_corner_cases.R @@ -340,7 +340,7 @@ test_that("colocboost prioritizes focal outcome correctly", { # Test with ambiguous corner cases test_that("get_ambiguous_colocalization handles edge cases with correlation thresholds", { - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- Ambiguous_Colocalization$ColocBoost_Results # Test with very 
high correlation thresholds (should find fewer ambiguities) diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 64e74f3..0c1acd7 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -191,7 +191,7 @@ test_that("get_hierarchical_clusters functions correctly", { test_that("get_ambiguous_colocalization identifies ambiguous colocalizations correctly", { # The function expects a specialized test dataset that has ambiguous colocalizations # There's a reference in the example to a dataset named "Ambiguous_Colocalization" - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- Ambiguous_Colocalization$ColocBoost_Results # Basic call with default parameters @@ -256,7 +256,7 @@ test_that("get_ambiguous_colocalization identifies ambiguous colocalizations cor test_that("get_ucos_summary funtionality", { # The function expects a specialized test dataset that has ambiguous colocalizations # There's a reference in the example to a dataset named "Ambiguous_Colocalization" - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- Ambiguous_Colocalization$ColocBoost_Results # Basic call with default parameters diff --git a/tests/testthat/test_model.R b/tests/testthat/test_model.R index 8a8848f..df998e8 100644 --- a/tests/testthat/test_model.R +++ b/tests/testthat/test_model.R @@ -89,6 +89,172 @@ test_that("colocboost_init_data correctly initializes data", { } }) +# Test for dict_keep_variables mapping issue +test_that("colocboost correctly maps focal outcome to keep_variables with dict_keep_variables", { + + # Test Case 1: Mixed individual and summary statistics data + # 5 Y outcomes with 1 superset X + 5 sumstats with 1 superset LD + set.seed(42) + n <- 100 + p_x <- 300 # X matrix has 300 variables + p_sumstat <- 500 # 5th sumstat has 500 variables, 
others have 300 + + # Generate superset X matrix + X_superset <- matrix(rnorm(n * p_x), n, p_x) + colnames(X_superset) <- paste0("SNP_X_", 1:p_x) + + # Generate Y outcomes (all use the same superset X) + Y_list <- list() + for (i in 1:5) { + Y_list[[i]] <- matrix(rnorm(n), n, 1) + } + + # Generate summary statistics + # First 4 sumstats have 300 variables, 5th has 500 variables + sumstat_list <- list() + for (i in 1:4) { + sumstat_list[[i]] <- data.frame( + beta = rnorm(p_x), + sebeta = abs(rnorm(p_x, 0, 0.1)), + n = n, + variant = paste0("SNP_S_", 1:p_x) + ) + } + + # 5th sumstat has 500 variables + sumstat_list[[5]] <- data.frame( + beta = rnorm(p_sumstat), + sebeta = abs(rnorm(p_sumstat, 0, 0.1)), + n = n, + variant = paste0("SNP_S_", 1:p_sumstat) + ) + + # Create superset LD matrix for summary statistics + LD_superset <- diag(p_sumstat) # Use diagonal for simplicity + colnames(LD_superset) <- rownames(LD_superset) <- paste0("SNP_S_", 1:p_sumstat) + + # Set up dictionaries + dict_YX <- c(1, 1, 1, 1, 1) # All 5 Y outcomes use the same X (index 1) + dict_sumstatLD <- c(1, 1, 1, 1, 1) # All 5 sumstats use the same LD (index 1) + + # Expected keep_variables structure: + # Index 1: Variables from X_superset (300 variables) + # Index 2: Variables from sumstat 1 (300 variables) + # Index 3: Variables from sumstat 2 (300 variables) + # Index 4: Variables from sumstat 3 (300 variables) + # Index 5: Variables from sumstat 4 (300 variables) + # Index 6: Variables from sumstat 5 (500 variables) + keep_variables <- list( + colnames(X_superset), # Index 1: 300 variables + sumstat_list[[1]]$variant, # Index 2: 300 variables + sumstat_list[[2]]$variant, # Index 3: 300 variables + sumstat_list[[3]]$variant, # Index 4: 300 variables + sumstat_list[[4]]$variant, # Index 5: 300 variables + sumstat_list[[5]]$variant # Index 6: 500 variables + ) + + # Test the dictionary creation logic + if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { + dict <- c(dict_YX, max(dict_YX) + 
dict_sumstatLD) + n_ind_variable <- max(dict_YX) + # Create dict_keep_variables: first part maps to X matrices, second part maps to sumstat indices + dict_keep_variables <- c(dict_YX, 1:length(dict_sumstatLD) + n_ind_variable) + } + + # Expected results: + # dict = c(1,1,1,1,1,2,2,2,2,2) (maps to data matrices) + # dict_keep_variables = c(1,1,1,1,1,2,3,4,5,6) (maps to keep_variables indices) + + expect_equal(dict, c(1,1,1,1,1,2,2,2,2,2)) + expect_equal(dict_keep_variables, c(1,1,1,1,1,2,3,4,5,6)) + + # Test focal outcome mapping + focal_outcome_idx <- 10 # 5th sumstat (the one with 500 variables) + focal_outcome_variables <- TRUE + overlap_variables <- FALSE + + if (focal_outcome_variables & !is.null(focal_outcome_idx)) { + if (focal_outcome_idx > length(dict)) { + stop("Target outcome index is over the total number of outcomes! please check!") + } + + # Use dict_keep_variables to get the correct keep_variables index + keep_variable_names <- keep_variables[[dict_keep_variables[focal_outcome_idx]]] + + # The focal outcome (index 10) should map to keep_variables index 6 (500 variables) + expect_equal(dict_keep_variables[focal_outcome_idx], 6) + expect_equal(length(keep_variable_names), 500) + expect_equal(keep_variable_names, sumstat_list[[5]]$variant) + } + + # Test Case 2: Only summary statistics + dict_YX_null <- NULL + dict_sumstatLD_only <- c(1,1,1,1,1) + + if (is.null(dict_YX_null) & !is.null(dict_sumstatLD_only)) { + dict_only_sumstat <- dict_sumstatLD_only + n_ind_variable_only <- 0 + # Only summary statistics, so dict_keep_variables maps directly to sumstat indices + dict_keep_variables_only <- 1:length(dict_sumstatLD_only) + } + + expect_equal(dict_keep_variables_only, c(1,2,3,4,5)) + + # Test focal outcome mapping for sumstat-only case + focal_outcome_idx_only <- 5 # 5th sumstat + keep_variable_names_only <- keep_variables[[dict_keep_variables_only[focal_outcome_idx_only] + 1]] # +1 because no X matrix + + 
expect_equal(dict_keep_variables_only[focal_outcome_idx_only], 5) + expect_equal(length(keep_variable_names_only), 500) + + # Test Case 3: Only individual level data + dict_YX_only <- c(1,1,1,1,1) + dict_sumstatLD_null <- NULL + + if (!is.null(dict_YX_only) & is.null(dict_sumstatLD_null)) { + dict_only_ind <- dict_YX_only + # Only individual level data, so dict_keep_variables is the same as dict_YX + dict_keep_variables_only_ind <- dict_YX_only + } + + expect_equal(dict_keep_variables_only_ind, c(1,1,1,1,1)) + + # All outcomes should map to the same X matrix (index 1) + focal_outcome_idx_ind <- 3 + expect_equal(dict_keep_variables_only_ind[focal_outcome_idx_ind], 1) + + # Test Case 4: Test with actual colocboost call (if the function accepts the corrected logic) + # This would be an integration test to ensure the fix works end-to-end + + # Create a minimal working example + # Skip if colocboost doesn't have the fixed dict_keep_variables logic yet + if (exists("colocboost_init_data")) { + expect_error({ + # This should work with the corrected dict_keep_variables mapping + cb_data_test <- colocboost_init_data( + X = list(X_superset), + Y = Y_list, + dict_YX = dict_YX, + Z = lapply(sumstat_list, function(s) s$beta / s$sebeta), # z-scores + LD = list(LD_superset), + N_sumstat = lapply(sumstat_list, function(s) s$n[1]), + dict_sumstatLD = dict_sumstatLD, + Var_y = NULL, + SeBhat = NULL, + keep_variables = keep_variables, + focal_outcome_idx = 10, # Should correctly map to 500 variables + focal_outcome_variables = TRUE, + overlap_variables = FALSE + ) + + # The resulting variable names should be from the 5th sumstat (500 variables) + expect_equal(length(cb_data_test$variable.names), 500) + }, NA) + } else { + skip("colocboost_init_data not directly accessible for integration test") + } +}) + # Test colocboost_assemble function test_that("colocboost_assemble processes model results", {