diff --git a/NAMESPACE b/NAMESPACE index 00b6170..8878dc0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,10 @@ export(get_hierarchical_clusters) export(get_robust_colocalization) export(get_robust_ucos) export(get_ucos_summary) +importFrom(Rfast,correls) +importFrom(Rfast,med) +importFrom(Rfast,standardise) +importFrom(Rfast,upper_tri) importFrom(grDevices,adjustcolor) importFrom(graphics,abline) importFrom(graphics,axis) diff --git a/R/colocboost.R b/R/colocboost.R index 3f9f860..d3e3d46 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -22,12 +22,20 @@ #' \code{variant} is required if sumstat for different outcomes do not have the same number of variables. #' \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale). #' @param LD A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -#' obtained from the same genotypes. +#' obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +#' If neither is provided, LD-free mode is used. +#' @param X_ref A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +#' \code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +#' When the number of reference panel samples is less than the number of variants (N_ref < P), +#' this avoids storing the full P x P LD matrix and reduces memory usage. +#' When N_ref >= P, LD is precomputed from \code{X_ref} internally. +#' Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used. #' @param dict_YX A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. #' The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. 
#' The innovation: do not provide the same matrix in \code{X} to reduce the computational burden. -#' @param dict_sumstatLD A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -#' The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +#' @param dict_sumstatLD An L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +#' corresponding to the same sumstat. +#' The first column should be 1:L for L sumstat. The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. +#' The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden. #' @param outcome_names The names of outcomes, which has the same order for Y. #' @param focal_outcome_idx The index of the focal outcome if perform GWAS-xQTL ColocBoost @@ -129,9 +137,10 @@ #' #' @family colocboost #' @importFrom stats na.omit +#' @importFrom Rfast correls standardise upper_tri med #' @export colocboost <- function(X = NULL, Y = NULL, # individual data - sumstat = NULL, LD = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, + sumstat = NULL, LD = NULL, X_ref = NULL, # summary statistics: either Z, bhat, sebhat, N, var_Y, ###### - index dict for X match multiple Y / LD match multiple sumstat dict_YX = NULL, # Y index for 1st column, X index for 2nd column dict_sumstatLD = NULL, # sumstat index for 1st column, LD index for 2nd column @@ -202,11 +211,15 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning("Error: No individual data (X, Y) or summary statistics (sumstat) or (effect_est, effect_se) are provided! 
Please check!") return(NULL) } + if (!is.null(LD) && !is.null(X_ref)) { + warning("Error: Provide either LD or X_ref, not both.") + return(NULL) + } # - check input data: individual level data and summary-level data validated_data <- colocboost_validate_input_data( X = X, Y = Y, - sumstat = sumstat, LD = LD, + sumstat = sumstat, LD = LD, X_ref = X_ref, dict_YX = dict_YX, dict_sumstatLD = dict_sumstatLD, effect_est = effect_est, effect_se = effect_se, effect_n = effect_n, overlap_variables = overlap_variables, @@ -227,6 +240,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data keep_variable_individual <- validated_data$keep_variable_individual sumstat <- validated_data$sumstat LD <- validated_data$LD + X_ref <- validated_data$X_ref sumstatLD_dict <- validated_data$sumstatLD_dict keep_variable_sumstat <- validated_data$keep_variable_sumstat Z <- validated_data$Z @@ -277,7 +291,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data } cb_data <- colocboost_init_data( X = X, Y = Y, dict_YX = yx_dict, - Z = Z, LD = LD, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, + Z = Z, LD = LD, X_ref = X_ref, N_sumstat = N_sumstat, dict_sumstatLD = sumstatLD_dict, Var_y = Var_y, SeBhat = SeBhat, keep_variables = keep_variables, focal_outcome_idx = focal_outcome_idx, @@ -408,7 +422,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data #' #' @keywords internal colocboost_validate_input_data <- function(X = NULL, Y = NULL, - sumstat = NULL, LD = NULL, + sumstat = NULL, LD = NULL, X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, effect_se = NULL, effect_n = NULL, overlap_variables = FALSE, @@ -659,10 +673,36 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, cos_npc_cutoff_updated <- cos_npc_cutoff npc_outcome_cutoff_updated <- npc_outcome_cutoff - if (is.null(LD)) { - # if no LD input, set diagonal matrix to LD + # --- Handle X_ref: convert to LD when N_ref >= P, or keep for on-the-fly computation + if 
(!is.null(X_ref)) { + if (is.data.frame(X_ref)) X_ref <- as.matrix(X_ref) + if (is.matrix(X_ref)) X_ref <- list(X_ref) + N_ref <- nrow(X_ref[[1]]) + P_ref <- ncol(X_ref[[1]]) + if (N_ref >= P_ref) { + # Precompute LD from X_ref (more compact when N_ref >= P) + message("N_ref >= P: precomputing LD from X_ref.") + LD <- lapply(X_ref, get_cormat) + for (idx in seq_along(LD)) { + rownames(LD[[idx]]) <- colnames(LD[[idx]]) <- colnames(X_ref[[idx]]) + } + X_ref <- NULL + } else { + # N_ref < P: keep X_ref for on-the-fly computation + # Set rownames/colnames on X_ref for variant matching (use colnames as LD would) + # Build LD-like variant mapping using X_ref colnames + # Standardize X_ref + for (idx in seq_along(X_ref)) { + X_ref[[idx]] <- standardise(X_ref[[idx]], center = TRUE, scale = TRUE) + X_ref[[idx]][which(is.na(X_ref[[idx]]))] <- 0 + } + } + } + + if (is.null(LD) && is.null(X_ref)) { + # if no LD or X_ref input, set diagonal matrix to LD warning( - "Providing the LD for summary statistics data is highly recommended. ", + "Providing the LD or X_ref for summary statistics data is highly recommended. ", "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ", "Additionally, the purity of CoS cannot be evaluated!" 
) @@ -679,59 +719,72 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, npc_outcome_cutoff_updated <- 0 } else { - - if (is.data.frame(LD)) LD <- as.matrix(LD) - if (is.matrix(LD)) LD <- list(LD) - # - check if NA in LD matrix - num_na <- sapply(LD, sum) - if (any(is.na(num_na))){ - warning("Error: Input LD must not contain missing values (NA).") - return(NULL) + # --- Determine reference list and variant extraction for LD or X_ref + if (!is.null(LD)) { + if (is.data.frame(LD)) LD <- as.matrix(LD) + if (is.matrix(LD)) LD <- list(LD) + # - check if NA in LD matrix + num_na <- sapply(LD, sum) + if (any(is.na(num_na))){ + warning("Error: Input LD must not contain missing values (NA).") + return(NULL) + } + ref_list <- LD + get_ref_variants <- function(ref_mat) rownames(ref_mat) + ref_label <- "LD" + ref_ncol <- function(ref_mat) ncol(ref_mat) + } else { + # X_ref path (N_ref < P, already standardized above) + ref_list <- X_ref + get_ref_variants <- function(ref_mat) colnames(ref_mat) + ref_label <- "X_ref" + ref_ncol <- function(ref_mat) ncol(ref_mat) } - # Create sumstat-LD mapping === - if (length(LD) == 1) { + + # Create sumstat-LD/X_ref mapping === + if (length(ref_list) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) - } else if (length(LD) == length(sumstat)) { + } else if (length(ref_list) == length(sumstat)) { sumstatLD_dict <- seq_along(sumstat) } else { if (is.null(dict_sumstatLD)) { - warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), - ' sumstats but only ', length(LD), ' LD matrices') + warning('Error: Please provide dict_sumstatLD: you have ', length(sumstat), + ' sumstats but only ', length(ref_list), ' ', ref_label, ' matrices') return(NULL) } else { - # - dict for sumstat to LD mapping + # - dict for sumstat to LD/X_ref mapping sumstatLD_dict <- rep(NA, length(sumstat)) for (i in 1:length(sumstat)) { tmp <- unique(dict_sumstatLD[dict_sumstatLD[, 1] == i, 2]) if (length(tmp) == 0) { - warning(paste("Error: You 
don't provide matched LD for sumstat", i)) + warning(paste("Error: You don't provide matched", ref_label, "for sumstat", i)) return(NULL) } else if (length(tmp) != 1) { - warning(paste("Error: You provide multiple matched LD for sumstat", i)) + warning(paste("Error: You provide multiple matched", ref_label, "for sumstat", i)) return(NULL) } else { sumstatLD_dict[i] <- tmp } } - if (max(sumstatLD_dict) > length(LD)) { - warning("Error: You don't provide enough LD matrices!") + if (max(sumstatLD_dict) > length(ref_list)) { + warning(paste("Error: You don't provide enough", ref_label, "matrices!")) return(NULL) } } } - + # === Filter variants for each sumstat === for (i in seq_along(sumstat)) { # Get sumstat variants (adjust field name based on your data structure) sumstat_variants <- sumstat[[i]]$variant n_total <- length(sumstat_variants) - # Get LD variants + # Get LD/X_ref variants ld_idx <- sumstatLD_dict[i] - current_ld <- LD[[ld_idx]] - ld_variants <- rownames(current_ld) + current_ref <- ref_list[[ld_idx]] + ld_variants <- get_ref_variants(current_ref) if (is.null(ld_variants)) { - if (ncol(current_ld) != n_total){ - warning('Error: LD matrix ', ld_idx, ' has no rownames. Please ensure all LD matrices have variant names as rownames.') + if (ref_ncol(current_ref) != n_total){ + warning('Error: ', ref_label, ' matrix ', ld_idx, ' has no variant names. 
Please ensure all ', ref_label, ' matrices have variant names.') return(NULL) } } @@ -834,7 +887,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, Z[[i.summstat]] <- z } } else { - Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- NULL + Z <- N_sumstat <- Var_y <- SeBhat <- sumstatLD_dict <- keep_variable_sumstat <- X_ref <- NULL M_updated <- M min_abs_corr_updated <- min_abs_corr jk_equiv_corr_updated <- jk_equiv_corr @@ -851,6 +904,7 @@ colocboost_validate_input_data <- function(X = NULL, Y = NULL, keep_variable_individual = keep_variable_individual, sumstat = sumstat, LD = LD, + X_ref = X_ref, sumstatLD_dict = sumstatLD_dict, keep_variable_sumstat = keep_variable_sumstat, Z = Z, diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 94cbbe5..734150d 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -132,10 +132,13 @@ colocboost_assemble <- function(cb_obj, } } if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) { + X_dict <- cb_obj$cb_data$dict[i] if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) { - X_dict <- cb_obj$cb_data$dict[i] cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX } + if (is.null(cb_obj_single$cb_data$data[[1]]$X_ref)) { + cb_obj_single$cb_data$data[[1]]$X_ref <- cb_obj$cb_data$data[[X_dict]]$X_ref + } } class(cb_obj_single) <- "colocboost" out_ucos_each <- colocboost_assemble_ucos(cb_obj_single, diff --git a/R/colocboost_assemble_cos.R b/R/colocboost_assemble_cos.R index 6a1243a..4958754 100644 --- a/R/colocboost_assemble_cos.R +++ b/R/colocboost_assemble_cos.R @@ -40,7 +40,7 @@ colocboost_assemble_cos <- function(cb_obj, for (iiii in 1:length(coloc_outcomes)) { X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(avWeight[, iiii, drop = FALSE], - X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, N = 
cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss @@ -81,7 +81,7 @@ colocboost_assemble_cos <- function(cb_obj, pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss)) } p_tmp <- matrix(get_purity(pos, - X = cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, n = n_purity ), 1, 3) @@ -145,7 +145,7 @@ colocboost_assemble_cos <- function(cb_obj, for (iiii in 1:length(coloc_outcomes)) { X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(avWeight[, iiii, drop = FALSE], - X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss @@ -205,7 +205,7 @@ colocboost_assemble_cos <- function(cb_obj, weight_cluster <- t(av[[iiii]][idx, , drop = FALSE]) X_dict <- cb_data$dict[coloc_outcomes[iiii]] tmp <- w_purity(weight_cluster, - X = cb_data$data[[X_dict]]$X, Xcorr = cb_data$data[[X_dict]]$XtX, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[coloc_outcomes[iiii]]]$N, n = n_purity, coverage = sec_coverage_thresh, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[coloc_outcomes[iiii]]]$variable_miss @@ -348,7 +348,7 @@ colocboost_assemble_cos <- function(cb_obj, for (i in 1:cb_model_para$L) { X_dict <- cb_data$dict[i] res[[i]] <- get_between_purity(cset1, cset2, - X = cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, miss_idx = 
cb_data$data[[i]]$variable_miss, P = cb_model_para$P @@ -436,7 +436,7 @@ colocboost_assemble_cos <- function(cb_obj, pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[i3]]$variable_miss)) } tmp <- matrix(get_purity(pos, - X = cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict]]), Xcorr = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i3]]$N, n = n_purity ), 1, 3) diff --git a/R/colocboost_assemble_ucos.R b/R/colocboost_assemble_ucos.R index 3336f19..5fa769a 100644 --- a/R/colocboost_assemble_ucos.R +++ b/R/colocboost_assemble_ucos.R @@ -63,7 +63,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, pos <- match(pos, setdiff(1:cb_model_para$P, cb_data$data[[1]]$variable_miss)) } purity <- matrix(get_purity(pos, - X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, + X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX, N = cb_data$data[[1]]$N, n = n_purity ), 1, 3) purity <- as.data.frame(purity) @@ -128,7 +128,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, w <- LogLik_change[idx] weight_cluster <- t(weights[idx, , drop = FALSE]) check_purity <- w_purity(weight_cluster, - X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, + X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX, N = cb_data$data[[1]]$N, n = n_purity, coverage = coverage, min_abs_corr = min_abs_corr, median_abs_corr = median_abs_corr, miss_idx = cb_data$data[[1]]$variable_miss @@ -234,7 +234,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, cset1 <- confidence_sets[[i.between]] cset2 <- confidence_sets[[j.between]] res <- get_between_purity(cset1, cset2, - X = cb_data$data[[1]]$X, + X = get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX, miss_idx = cb_data$data[[1]]$variable_miss, P = cb_model_para$P @@ -300,7 +300,7 @@ colocboost_assemble_ucos <- function(cb_obj_single, rbind( purity, matrix(get_purity(pos, - X = cb_data$data[[1]]$X, Xcorr = cb_data$data[[1]]$XtX, + X = 
get_genotype_matrix(cb_data$data[[1]]), Xcorr = cb_data$data[[1]]$XtX, N = cb_data$data[[1]]$N, n = n_purity ), 1, 3) ) diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index d449b03..58c261b 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -119,6 +119,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { res = model_update[[ii]]$res, N = data_update[[ii]]$N, XtX = cb_data$data[[X_dict[ii]]]$XtX, + X_ref = cb_data$data[[X_dict[ii]]]$X_ref, YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, @@ -194,6 +195,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { res = model_update[[ii]]$res, N = data_update[[ii]]$N, XtX = cb_data$data[[X_dict[ii]]]$XtX, + X_ref = cb_data$data[[X_dict[ii]]]$X_ref, YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, @@ -238,6 +240,7 @@ boost_check_update_jk_nofocal <- function(cb_model, cb_model_para, cb_data) { res = model_update[[ii]]$res, N = data_update[[ii]]$N, XtX = cb_data$data[[X_dict[ii]]]$XtX, + X_ref = cb_data$data[[X_dict[ii]]]$X_ref, YtY = data_update[[ii]]$YtY, XtY = data_update[[ii]]$XtY, beta_k = model_update[[ii]]$beta, @@ -365,7 +368,7 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, # ---- first, check LD between jk_focal and jk_each based on focal LD ld <- sapply(jk_each[-pp_focal], function(jki) { get_LD_jk1_jk2(jk_focal, jki, - X = cb_data$data[[X_dict[pp_focal]]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict[pp_focal]]]), XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, N = cb_data$data[[X_dict[pp_focal]]]$N, remain_jk = 1:cb_model_para$P @@ -378,6 +381,7 @@ boost_check_update_jk_focal <- function(cb_model, cb_model_para, cb_data, jk_focal_tmp <- setdiff(order_cor, variable_missing)[1] # ----- third, if picked variant within the same LD buddies ld_tmp <- get_LD_jk1_jk2(jk_focal, 
jk_focal_tmp, + X = get_genotype_matrix(cb_data$data[[X_dict[pp_focal]]]), XtX = cb_data$data[[X_dict[pp_focal]]]$XtX, remain_jk = 1:cb_model_para$P ) @@ -525,7 +529,7 @@ check_jk_jkeach <- function(jk, jk_each, change_log_jkeach <- model_update[[i]]$change_loglike[jk_each[i]] change_each <- abs(change_log_jk - change_log_jkeach) LD_temp <- get_LD_jk1_jk2(jk, jk_each[i], - X = cb_data$data[[X_dict[i]]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict[i]]]), XtX = cb_data$data[[X_dict[i]]]$XtX, N = data_update[[i]]$N, remain_jk = setdiff(1:length(model_update[[i]]$res), data_update[[i]]$variable_miss) @@ -579,7 +583,7 @@ check_pair_jkeach <- function(jk_each, data_update <- cb_data$data[pos.update] LD_all <- lapply(1:length(jk_each), function(idx){ get_LD_jk_each(jk_each, - X = cb_data$data[[X_dict[idx]]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict[idx]]]), XtX = cb_data$data[[X_dict[idx]]]$XtX, N = data_update[[idx]]$N, remain_jk = setdiff(1:length(model_update[[idx]]$res), data_update[[idx]]$variable_miss) @@ -610,7 +614,8 @@ check_pair_jkeach <- function(jk_each, estimate_change_profile_res <- function(jk, X = NULL, res = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, + XtX = NULL, X_ref = NULL, + YtY = NULL, XtY = NULL, beta_k = NULL, miss_idx = NULL) { if (!is.null(X)) { @@ -629,11 +634,8 @@ estimate_change_profile_res <- function(jk, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - rtr <- yty - 2 * sum(beta_k * xty) + sum(beta_k^2) - } else { - rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) - } + XtX_beta_k <- compute_xtx_product(beta_k, XtX = xtx, X_ref = X_ref) + rtr <- yty - 2 * sum(beta_k * xty) + sum(XtX_beta_k * beta_k) } numerator <- xtr^2 / (2 * rtr) denominator <- 0.5 * log(2 * pi * rtr) + 0.5 diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index ff9f312..73d8a74 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -227,7 +227,7 @@ check_null_post <- 
function(cb_obj, } get_profile <- function(cs_beta, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx, adj_dep = 1, X_ref = NULL) { if (!is.null(X)) { mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) } else if (!is.null(XtY)) { @@ -242,11 +242,8 @@ check_null_post <- function(cb_obj, } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep - } else { - (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep - } + XtX_beta <- compute_xtx_product(cs_beta, XtX = xtx, X_ref = X_ref) + (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } } @@ -284,7 +281,7 @@ check_null_post <- function(cb_obj, return(tau * matrixStats::logSumExp(exp_term / tau + log(delta))) } - update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx) { + update_res <- function(X = NULL, Y = NULL, XtX = NULL, XtY = NULL, N = NULL, cs_beta, miss_idx, X_ref = NULL) { if (!is.null(X)) { return(Y - X %*% cs_beta) } else if (!is.null(XtX)) { @@ -294,18 +291,10 @@ check_null_post <- function(cb_obj, if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - if (length(xtx) == 1){ - res.tmp[-miss_idx] <- xty - cs_beta[-miss_idx] / beta_scaling - } else { - res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) - } + res.tmp[-miss_idx] <- xty - compute_xtx_product(cs_beta[-miss_idx] / beta_scaling, XtX = xtx, X_ref = X_ref) } else { xty <- XtY / scaling.factor - if (length(xtx) == 1){ - res.tmp <- xty - (cs_beta / beta_scaling) - } else { - res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) - } + res.tmp <- xty - compute_xtx_product(cs_beta / beta_scaling, XtX = xtx, X_ref = X_ref) } return(res.tmp) } @@ -330,7 +319,8 @@ check_null_post <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, 
XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, YtY = cb_data$data[[j]]$YtY, N = cb_data$data[[j]]$N, - miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep + miss_idx = cb_data$data[[j]]$variable_miss, adj_dep = adj_dep, + X_ref = cb_data$data[[X_dict]]$X_ref ) last_profile <- extract_last(cb_obj$cb_model[[j]]$profile_loglike_each) change <- abs(cs_profile - last_profile) @@ -348,7 +338,8 @@ check_null_post <- function(cb_obj, X = cb_data$data[[X_dict]]$X, Y = cb_data$data[[j]]$Y, XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[j]]$XtY, N = cb_data$data[[j]]$N, cs_beta, - miss_idx = cb_data$data[[j]]$variable_miss + miss_idx = cb_data$data[[j]]$variable_miss, + X_ref = cb_data$data[[X_dict]]$X_ref ) cs_obj <- get_cs_obj(cs_beta, res, cb_obj$cb_model_para$tau, cb_obj$cb_model_para$func_simplex, cb_obj$cb_model_para$lambda, @@ -402,8 +393,6 @@ check_null_post <- function(cb_obj, #' @noRd #' @importFrom stats na.omit get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { - get_upper_tri <- Rfast::upper_tri - get_median <- Rfast::med if (sum(is.na(pos)) != 0) { pos <- as.numeric(na.omit(pos)) @@ -424,19 +413,19 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { get_cormat(X_sub) }) corr[which(is.na(corr))] <- 0 - value <- abs(get_upper_tri(corr)) + value <- abs(upper_tri(corr)) } else { if (length(Xcorr) == 1){ value <- 0 } else { Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr - value <- abs(get_upper_tri(Xcorr[pos, pos])) + value <- abs(upper_tri(Xcorr[pos, pos])) } } return(c( min(value), sum(value) / length(value), - get_median(value) + med(value) )) } } @@ -458,7 +447,6 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU cr <- tcrossprod(X_sub1, X_sub2) return(cr) } - get_median <- Rfast::med if (is.null(Xcorr)) { X_sub1 <- scale(X[, pos1, drop = FALSE], center = T, scale = F) @@ -479,7 +467,7 @@ get_between_purity <- function(pos1, pos2, X = NULL, 
Xcorr = NULL, miss_idx = NU } } } - return(c(min(value), max(value), get_median(value))) + return(c(min(value), max(value), med(value))) } #' Function to get the evidence of colocalization @@ -498,7 +486,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } get_cos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1, X_ref = NULL) { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -514,11 +502,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep - } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep - } + XtX_beta <- compute_xtx_product(cs_beta, XtX = xtx, X_ref = X_ref) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } delta <- yty - cos_profile if (delta <= 0) { @@ -543,7 +528,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + adj_dep = cb_data$data[[outcome_idx]]$dependency, + X_ref = cb_data$data[[X_dict]]$X_ref ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) @@ -612,7 +598,7 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { get_ucos_evidence <- function(cb_obj, ucoloc_info) { get_ucos_profile <- function(cs_beta, outcome_idx, X = NULL, Y = NULL, N = NULL, - XtX = NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1) { + XtX 
= NULL, YtY = NULL, XtY = NULL, miss_idx = NULL, adj_dep = 1, X_ref = NULL) { if (!is.null(X)) { cos_profile <- mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) yty <- var(Y) @@ -628,11 +614,8 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { } else { xty <- XtY / scaling_factor } - if (length(xtx) == 1){ - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep - } else { - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep - } + XtX_beta <- compute_xtx_product(cs_beta, XtX = xtx, X_ref = X_ref) + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(XtX_beta * cs_beta)) * adj_dep } delta <- yty - cos_profile if (delta <= 0) { @@ -657,7 +640,8 @@ get_ucos_evidence <- function(cb_obj, ucoloc_info) { XtX = cb_data$data[[X_dict]]$XtX, XtY = cb_data$data[[outcome_idx]]$XtY, YtY = cb_data$data[[outcome_idx]]$YtY, N = cb_data$data[[outcome_idx]]$N, miss_idx = cb_data$data[[outcome_idx]]$variable_miss, - adj_dep = cb_data$data[[outcome_idx]]$dependency + adj_dep = cb_data$data[[outcome_idx]]$dependency, + X_ref = cb_data$data[[X_dict]]$X_ref ) max_profile <- max(cb_obj$cb_model[[outcome_idx]]$profile_loglike_each) ifelse(max_profile < cos_profile, 0, max_profile - cos_profile) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index edca937..62e11bc 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -28,7 +28,7 @@ colocboost_inits <- function() { #' @noRd #' @keywords cb_objects colocboost_init_data <- function(X, Y, dict_YX, - Z, LD, N_sumstat, dict_sumstatLD, + Z, LD, X_ref = NULL, N_sumstat, dict_sumstatLD, Var_y, SeBhat, keep_variables, focal_outcome_idx = NULL, @@ -105,14 +105,15 @@ colocboost_init_data <- function(X, Y, dict_YX, } n_ind <- flag - 1 # if summary: XtX XtY, YtY - if (!is.null(Z) & !is.null(LD)) { + if (!is.null(Z) & (!is.null(LD) | !is.null(X_ref))) { ####################### need to consider more ######################### # ------ only code up one sumstat 
variant_lists <- keep_variables[c((n_ind_variable+1):length(keep_variables))] sumstat_formated <- process_sumstat( Z, N_sumstat, Var_y, SeBhat, LD, variant_lists, dict_sumstatLD, - keep_variable_names + keep_variable_names, + X_ref = X_ref ) for (i in 1:length(Z)) { cb_data$data[[flag]] <- sumstat_formated$results[[i]] @@ -174,7 +175,9 @@ colocboost_init_model <- function(cb_data, "learning_rate_init" = learning_rate_init, "stop_thresh" = stop_thresh, "ld_jk" = c(), - "jk" = c() + "jk" = c(), + "scaling_factor" = if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1, + "beta_scaling" = if (!is.null(cb_data$data[[i]]$N)) 1 else 100 ) data_each <- cb_data$data[[i]] @@ -364,6 +367,51 @@ estimate_change_profile <- function(X = NULL, Y = NULL, N = NULL, return(change_loglike) } +#' Get genotype matrix from a data entry for LD/correlation computation. +#' +#' Each data entry is either individual-level (has \code{$X}) or summary statistics +#' (has \code{$XtX} and/or \code{$X_ref}). These are mutually exclusive: +#' \code{$X} and \code{$X_ref} never coexist on the same entry. +#' +#' This helper is used where a genotype matrix is needed to compute correlations +#' (e.g., purity checks via \code{get_cormat}). For summary stats entries that have +#' \code{$XtX} (precomputed LD), callers typically use \code{Xcorr = $XtX} directly +#' and this function returns NULL — the Xcorr path takes priority in \code{get_purity}. +#' For summary stats entries with \code{$X_ref} (reference panel, no precomputed LD), +#' this returns the reference panel so correlations can be computed on the fly. 
+#' +#' @param data_entry A single element of \code{cb_data$data} +#' @return A genotype matrix (individual-level X or reference panel X_ref), or NULL +#' @noRd +get_genotype_matrix <- function(data_entry) { + if (!is.null(data_entry$X)) return(data_entry$X) + if (!is.null(data_entry$X_ref)) return(data_entry$X_ref) + return(NULL) +} + +#' Compute the product XtX \%*\% v from whichever LD source is available. +#' +#' Dispatches to the appropriate computation: +#' \itemize{ +#' \item \code{XtX} matrix (P x P): direct matrix-vector multiply, O(P^2) +#' \item \code{X_ref} reference panel (N_ref x P): two matrix-vector multiplies, O(N_ref * P) +#' \item Neither (LD-free / scalar case): returns \code{v} unchanged (identity) +#' } +#' @param v Numeric vector to multiply +#' @param XtX LD matrix (P x P), or scalar 1 for LD-free, or NULL +#' @param X_ref Standardized reference panel matrix (N_ref x P), or NULL +#' @return Numeric vector of same length as v +#' @noRd +compute_xtx_product <- function(v, XtX = NULL, X_ref = NULL) { + if (!is.null(XtX) && length(XtX) > 1) { + as.vector(XtX %*% v) + } else if (!is.null(X_ref)) { + as.vector(crossprod(X_ref, X_ref %*% v) / (nrow(X_ref) - 1)) + } else { + v + } +} + inital_residual <- function(Y = NULL, XtY = NULL) { if (!is.null(Y)) { return(Y) @@ -375,10 +423,11 @@ inital_residual <- function(Y = NULL, XtY = NULL) { # - Calculate correlation between X and res get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, - YtY = NULL, XtX = NULL, beta_k = NULL, miss_idx = NULL) { + YtY = NULL, XtX = NULL, X_ref = NULL, beta_k = NULL, miss_idx = NULL, + XtX_beta_cache = NULL) { if (!is.null(X)) { corr <- suppressWarnings({ - Rfast::correls(res, X)[, "correlation"] + correls(res, X)[, "correlation"] }) corr[which(is.na(corr))] <- 0 return(corr) @@ -397,11 +446,12 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, Xtr <- res / scaling_factor XtY <- XtY / scaling_factor } - if (length(XtX) == 1){ - 
var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) + if (!is.null(XtX_beta_cache)) { + XtX_beta_k <- XtX_beta_cache } else { - var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + XtX_beta_k <- compute_xtx_product(beta_k, XtX = XtX, X_ref = X_ref) } + var_r <- YtY - 2 * sum(beta_k * XtY) + sum(XtX_beta_k * beta_k) if (var_r > 1e-6) { corr_nomiss <- Xtr / sqrt(var_r) if (length(miss_idx) != 0) { @@ -556,7 +606,7 @@ get_multiple_testing_correction <- function(z, miss_idx = NULL, func_multi_test #' #' @return List containing processed data with optimized LD submatrix storage #' @noRd -process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dict, target_variants) { +process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dict, target_variants, X_ref = NULL) { # Step 1: Identify unique combinations of (variant list, LD matrix) @@ -602,9 +652,16 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic current_z <- Z[[i]] current_n <- N[[i]] - # Get corresponding LD matrix from original dictionary mapping + # Get corresponding LD/X_ref from original dictionary mapping ld_index <- dict[i] - current_ld_matrix <- ld_matrices[[ld_index]] + use_xref <- !is.null(X_ref) && is.null(ld_matrices) + if (use_xref) { + current_xref <- X_ref[[ld_index]] + current_ld_matrix <- NULL + } else { + current_ld_matrix <- ld_matrices[[ld_index]] + current_xref <- NULL + } # Find common variants between current list and target variants common_variants <- intersect(current_variants, target_variants) @@ -620,10 +677,18 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - if (length(current_ld_matrix) == 1){ + ld_submatrix <- NULL + xref_submatrix <- NULL + if (use_xref) { + # X_ref path: extract column submatrix + if (length(common_variants) > 0 && i == 
unified_dict[i]) { + matched_indices <- match(common_variants, colnames(current_xref)) + xref_submatrix <- current_xref[, matched_indices, drop = FALSE] + colnames(xref_submatrix) <- common_variants + } + } else if (length(current_ld_matrix) == 1) { ld_submatrix <- current_ld_matrix } else { - ld_submatrix <- NULL if (length(common_variants) > 0) { # Only include the submatrix if this entry is unique or is the first occurrence if (i == unified_dict[i]) { @@ -644,7 +709,11 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic # Organize data if (is.null(current_n)) { - tmp$XtX <- ld_submatrix + if (!is.null(xref_submatrix)) { + tmp$X_ref <- xref_submatrix + } else { + tmp$XtX <- ld_submatrix + } tmp$XtY <- Z_extend tmp$YtY <- 1 } else { @@ -652,7 +721,14 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic # var_y, shat (and bhat) are provided, so the effects are on the # *original scale*. adj <- 1 / (Z_extend^2 + current_n - 2) - if (!is.null(ld_submatrix)) { + if (!is.null(xref_submatrix)) { + # X_ref path with scaling: compute LD from X_ref, then scale + ld_from_xref <- get_cormat(xref_submatrix) + rownames(ld_from_xref) <- colnames(ld_from_xref) <- common_variants + XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2) + xtx <- t(ld_from_xref * sqrt(XtXdiag)) * sqrt(XtXdiag) + tmp$XtX <- (xtx + t(xtx)) / 2 + } else if (!is.null(ld_submatrix)) { XtXdiag <- Var_y[[i]] * adj / (SeBhat[[i]]^2) xtx <- t(ld_submatrix * sqrt(XtXdiag)) * sqrt(XtXdiag) tmp$XtX <- (xtx + t(xtx)) / 2 @@ -660,7 +736,9 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic tmp$YtY <- (current_n - 1) * Var_y[[i]] tmp$XtY <- Z_extend * sqrt(adj) * Var_y[[i]] / SeBhat[[i]] } else { - if (!is.null(ld_submatrix)) { + if (!is.null(xref_submatrix)) { + tmp$X_ref <- xref_submatrix + } else if (!is.null(ld_submatrix)) { tmp$XtX <- ld_submatrix } tmp$YtY <- (current_n - 1) @@ -769,7 +847,7 @@ process_individual_data <- 
function(X, Y, dict_YX, target_variants, if (!intercept & !standardize) { x_stand <- matched_X } else { - x_stand <- Rfast::standardise(matched_X, center = intercept, scale = standardize) + x_stand <- standardise(matched_X, center = intercept, scale = standardize) } x_stand[which(is.na(x_stand))] <- 0 tmp$X <- x_stand diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 734bcb1..374f71e 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -28,7 +28,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { } else { cb_model[[i]]$jk <- c(cb_model[[i]]$jk, update_jk) ld_jk <- get_LD_jk(update_jk, - X = cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict]]), XtX = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), @@ -54,7 +54,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { ) x_tmp <- cb_data$data[[X_dict]]$X - scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1 + scaling_factor <- cb_model[[i]]$scaling_factor cov_Xtr <- if (!is.null(x_tmp)) { t(x_tmp) %*% as.matrix(cb_model[[i]]$res) / scaling_factor } else { @@ -103,37 +103,32 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { y <- cb_data$data[[i]]$Y beta <- cb_model[[i]]$beta profile_log <- mean((y - x %*% beta)^2) * adj_dep - } else if (!is.null(cb_data$data[[X_dict]]$XtX)) { - scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) cb_data$data[[i]]$N - 1 else 1 - beta_scaling <- if (!is.null(cb_data$data[[i]]$N)) 1 else 100 + } else if (!is.null(cb_data$data[[X_dict]]$XtX) || !is.null(cb_data$data[[X_dict]]$X_ref)) { + beta_scaling <- cb_model[[i]]$beta_scaling # - summary statistics xtx <- cb_data$data[[X_dict]]$XtX + xref <- cb_data$data[[X_dict]]$X_ref cb_model[[i]]$res <- rep(0, cb_model_para$P) if (length(cb_data$data[[i]]$variable_miss) != 0) { beta <- 
cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - if (length(xtx) == 1){ - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * beta - } else { - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta - } - + XtX_beta <- compute_xtx_product(beta, XtX = xtx, X_ref = xref) + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * XtX_beta } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - if (length(xtx) == 1){ - cb_model[[i]]$res <- xty - scaling_factor * beta - } else { - cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta - } + XtX_beta <- compute_xtx_product(beta, XtX = xtx, X_ref = xref) + cb_model[[i]]$res <- xty - scaling_factor * XtX_beta } - # - profile-loglikelihood + # - cache XtX %*% beta for reuse in get_correlation (avoids redundant O(P^2) computation) + cb_model[[i]]$XtX_beta_cache <- XtX_beta + # - profile-loglikelihood (reuses cached XtX_beta) yty <- cb_data$data[[i]]$YtY / scaling_factor xty <- xty / scaling_factor - if (length(xtx) == 1){ + if (!is.null(xtx) && length(xtx) == 1){ profile_log <- (yty - 2 * sum(beta * xty) + sum(beta^2)) * adj_dep } else { - profile_log <- (yty - 2 * sum(beta * xty) + sum((xtx %*% as.matrix(beta)) * beta)) * adj_dep + profile_log <- (yty - 2 * sum(beta * xty) + sum(XtX_beta * beta)) * adj_dep } } cb_model[[i]]$profile_loglike_each <- c(cb_model[[i]]$profile_loglike_each, profile_log) @@ -147,7 +142,7 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P = NULL) { if (!is.null(X)) { corr <- suppressWarnings({ - Rfast::correls(X[, jk1], X)[, "correlation"] + correls(X[, jk1], X)[, "correlation"] }) corr[which(is.na(corr))] <- 0 } else if (!is.null(XtX)) { @@ -253,7 +248,7 @@ boost_obj_last <- function(cb_data, cb_model, 
cb_model_para) { ########## MAIN CALCULATION ################### X_dict <- cb_data$dict[i] ld_jk <- get_LD_jk(jk, - X = cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_data$data[[X_dict]]), XtX = cb_data$data[[X_dict]]$XtX, N = cb_data$data[[i]]$N, remain_idx = setdiff(1:cb_model_para$P, cb_data$data[[i]]$variable_miss), @@ -277,7 +272,7 @@ boost_obj_last <- function(cb_data, cb_model, cb_model_para) { ) x_tmp <- cb_data$data[[X_dict]]$X - scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) (cb_data$data[[i]]$N - 1) else 1 + scaling_factor <- cb_model[[i]]$scaling_factor cov_Xtr <- if (!is.null(x_tmp)) { t(x_tmp) %*% as.matrix(cb_model[[i]]$res) / scaling_factor } else { diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index b486bf8..afd671c 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -40,7 +40,7 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, # - addhoc: if median_cos_abs_corr > 0.8, remove single sets X_dict <- cb_obj$cb_data$dict[fine_outcome] res <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[fine_outcome]]$variable_miss, P = cb_obj$cb_model_para$P @@ -63,7 +63,7 @@ merge_cos_ucos <- function(cb_obj, out_cos, out_ucos, coverage = 0.95, for (ii in 1:cb_obj$cb_model_para$L) { X_dict <- cb_obj$cb_data$dict[ii] res[[ii]] <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, P = cb_obj$cb_model_para$P @@ -148,7 +148,7 @@ merge_ucos <- function(cb_obj, past_out, for (ii in yy) { X_dict <- cb_obj$cb_data$dict[ii] res[[flag]] <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = 
cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, P = cb_obj$cb_model_para$P @@ -219,7 +219,7 @@ merge_ucos <- function(cb_obj, past_out, } } tmp <- matrix(get_purity(pos, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, N = cb_obj$cb_data$data[[i3]]$N, n = n_purity ), 1, 3) @@ -687,7 +687,7 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { pos <- match(pos, setdiff(1:cb_obj$cb_model_para$P, cb_obj$cb_data$data[[i3]]$variable_miss)) } tmp <- matrix(get_purity(pos, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, N = cb_obj$cb_data$data[[i3]]$N, n = cb_obj$cb_model_para$n_purity ), 1, 3) @@ -760,7 +760,7 @@ get_cos_details <- function(cb_obj, coloc_out, data_info = NULL) { for (ii in yy) { X_dict <- cb_obj$cb_data$dict[ii] res[[flag]] <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, P = cb_obj$cb_model_para$P @@ -1022,7 +1022,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output for (ii in yy) { X_dict <- cb_obj$cb_data$dict[ii] res[[flag]] <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, P = cb_obj$cb_model_para$P @@ -1066,7 +1066,7 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output for (ii in yy) { X_dict <- cb_obj$cb_data$dict[ii] res[[flag]] <- get_between_purity(cset1, cset2, - X = cb_obj$cb_data$data[[X_dict]]$X, + X = get_genotype_matrix(cb_obj$cb_data$data[[X_dict]]), Xcorr = 
cb_obj$cb_data$data[[X_dict]]$XtX, miss_idx = cb_obj$cb_data$data[[ii]]$variable_miss, P = cb_obj$cb_model_para$P diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 4462680..4a90413 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -282,8 +282,10 @@ cb_model_update <- function(cb_data, cb_model, cb_model_para) { X = cb_data$data[[X_dict]]$X, res = model_each$res, XtY = data_each$XtY, N = data_each$N, YtY = data_each$YtY, XtX = cb_data$data[[X_dict]]$XtX, + X_ref = cb_data$data[[X_dict]]$X_ref, beta_k = model_each$beta, - miss_idx = data_each$variable_miss + miss_idx = data_each$variable_miss, + XtX_beta_cache = model_each$XtX_beta_cache ) cb_model[[i]]$correlation <- tmp cb_model[[i]]$z <- get_z(tmp, n = data_each$N, model_each$res) diff --git a/inst/benchmark/benchmark_phase1.R b/inst/benchmark/benchmark_phase1.R new file mode 100644 index 0000000..00db087 --- /dev/null +++ b/inst/benchmark/benchmark_phase1.R @@ -0,0 +1,145 @@ +#!/usr/bin/env Rscript +# Benchmark: Phase 1 optimization (XtX*beta cache + precomputed constants) +# +# This script compares the optimized code against a simulated "no-cache" baseline +# by manually running the dominant O(P^2) operations the number of times they +# would occur with and without caching. +# +# The optimization eliminates 2 of 3 redundant XtX %*% beta computations per +# iteration per outcome. 
+
+library(MASS)
+
+cat("=== ColocBoost Phase 1 Optimization Benchmark ===\n\n")
+
+# ---- Generate test data at different scales ----
+run_benchmark <- function(p, L, M, n_ref = 500, seed = 42) {
+  set.seed(seed)
+
+  cat(sprintf("P = %d variants, L = %d outcomes, M = %d iterations, N_ref = %d\n", p, L, M, n_ref))
+
+  # Generate LD matrix (P x P)
+  sigma <- matrix(0, p, p)
+  for (i in 1:p) {
+    for (j in 1:p) {
+      sigma[i, j] <- 0.9^abs(i - j)
+    }
+  }
+  # AR(1) correlation structure is positive definite by construction, so no adjustment is needed
+  LD <- sigma
+
+  # Generate random beta vector
+  beta <- rnorm(p) * 0.01
+
+  cat(sprintf(" LD matrix size: %.1f MB\n", object.size(LD) / 1024^2))
+
+  # ---- Benchmark: XtX %*% beta ----
+  # Before optimization: 3 * M * L calls to XtX %*% beta
+  # After optimization: 1 * M * L calls to XtX %*% beta
+
+  n_calls_before <- 3 * M * L
+  n_calls_after <- 1 * M * L
+
+  # Time a single XtX %*% beta
+  t_single <- system.time({
+    for (rep in 1:100) {
+      result <- LD %*% beta
+    }
+  })[["elapsed"]] / 100
+
+  cat(sprintf(" Single XtX %%*%% beta: %.4f seconds\n", t_single))
+  cat(sprintf(" Before optimization: %d calls = %.2f seconds\n",
+    n_calls_before, n_calls_before * t_single))
+  cat(sprintf(" After optimization: %d calls = %.2f seconds\n",
+    n_calls_after, n_calls_after * t_single))
+  cat(sprintf(" Speedup on dominant cost: %.1fx\n",
+    n_calls_before * t_single / (n_calls_after * t_single)))
+  cat(sprintf(" Time saved: %.2f seconds\n\n",
+    (n_calls_before - n_calls_after) * t_single))
+
+  invisible(list(
+    p = p, L = L, M = M,
+    t_single = t_single,
+    t_before = n_calls_before * t_single,
+    t_after = n_calls_after * t_single
+  ))
+}
+
+# ---- Run end-to-end colocboost benchmark ----
+run_colocboost_benchmark <- function(p, L, M, n = 200, seed = 42) {
+  set.seed(seed)
+
+  cat(sprintf("\n--- End-to-end colocboost: P=%d, L=%d, M=%d ---\n", p, L, M))
+
+  sigma <- matrix(0, p, p)
+  for (i in 1:p) {
+    for (j in 1:p) {
+      sigma[i, j] <- 0.9^abs(i - j)
+    }
+  }
+  X <- mvrnorm(n, rep(0, p), sigma)
+  colnames(X)
<- paste0("SNP", 1:p) + Y <- matrix(rnorm(n * L), n, L) + # Add signal to SNP5 for all traits + for (l in 1:L) { + Y[, l] <- Y[, l] + X[, 5] * 0.5 + } + LD <- cor(X) + + # Generate summary statistics + sumstat_list <- list() + for (i in 1:L) { + z <- rep(0, p) + beta_hat <- rep(0, p) + se_hat <- rep(0, p) + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { + beta_hat[j] <- fit[2, 1] + se_hat[j] <- fit[2, 2] + z[j] <- beta_hat[j] / se_hat[j] + } + } + sumstat_list[[i]] <- data.frame( + beta = beta_hat, sebeta = se_hat, z = z, + n = n, variant = colnames(X) + ) + } + + # Time full colocboost run + t_full <- system.time({ + suppressWarnings(suppressMessages({ + result <- colocboost::colocboost( + sumstat = sumstat_list, + LD = LD, + M = M, + output_level = 1 + ) + })) + })[["elapsed"]] + + cat(sprintf(" Total wall time: %.2f seconds\n", t_full)) + invisible(t_full) +} + +# ---- Scenarios ---- + +cat("--- Micro-benchmark: XtX %*% beta operation ---\n\n") + +results <- list() +results[[1]] <- run_benchmark(p = 1000, L = 2, M = 100) +results[[2]] <- run_benchmark(p = 2000, L = 5, M = 200) +results[[3]] <- run_benchmark(p = 5000, L = 3, M = 300) +results[[4]] <- run_benchmark(p = 5000, L = 10, M = 500) + +cat("\n--- Summary Table ---\n") +cat(sprintf("%-8s %-4s %-5s %-12s %-12s %-8s\n", + "P", "L", "M", "Before(s)", "After(s)", "Speedup")) +for (r in results) { + cat(sprintf("%-8d %-4d %-5d %-12.2f %-12.2f %-8.1fx\n", + r$p, r$L, r$M, r$t_before, r$t_after, r$t_before / r$t_after)) +} + +cat("\n--- End-to-end colocboost timings ---\n") +run_colocboost_benchmark(p = 100, L = 2, M = 50) +run_colocboost_benchmark(p = 100, L = 5, M = 100) diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 4f6c3df..7e1b0a0 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -12,6 +12,7 @@ colocboost( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, outcome_names = NULL, @@ -81,14 +82,23 @@ The columns of 
data.frame should include either \code{z} or \code{beta}/\code{se \code{var_y} is the variance of phenotype (default is 1 meaning that the Y is in the \dQuote{standardized} scale).} \item{LD}{A list of correlation matrix indicating the LD matrix for each genotype. It also could be a single matrix if all sumstats were -obtained from the same genotypes.} +obtained from the same genotypes. Provide either \code{LD} or \code{X_ref}, not both. +If neither is provided, LD-free mode is used.} + +\item{X_ref}{A reference panel genotype matrix (N_ref x P) or a list of matrices, as an alternative to providing a precomputed +\code{LD} matrix. Column names must include variant names matching those in \code{sumstat}. +When the number of reference panel samples is less than the number of variants (N_ref < P), +this avoids storing the full P x P LD matrix and reduces memory usage. +When N_ref >= P, LD is precomputed from \code{X_ref} internally. +Provide either \code{LD} or \code{X_ref}, not both. If neither is provided, LD-free mode is used.} \item{dict_YX}{A L by 2 matrix of dictionary for \code{X} and \code{Y} if there exist subsets of outcomes corresponding to the same X matrix. The first column should be 1:L for L outcomes. The second column should be the index of \code{X} corresponding to the outcome. The innovation: do not provide the same matrix in \code{X} to reduce the computational burden.} -\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} if there exist subsets of outcomes corresponding to the same sumstat. -The first column should be 1:L for L sumstat The second column should be the index of \code{LD} corresponding to the sumstat. +\item{dict_sumstatLD}{A L by 2 matrix of dictionary for \code{sumstat} and \code{LD} (or \code{X_ref}) if there exist subsets of outcomes +corresponding to the same sumstat. 
+The first column should be 1:L for L sumstat The second column should be the index of \code{LD} (or \code{X_ref}) corresponding to the sumstat. The innovation: do not provide the same matrix in \code{LD} to reduce the computational burden.} \item{outcome_names}{The names of outcomes, which has the same order for Y.} diff --git a/man/colocboost_validate_input_data.Rd b/man/colocboost_validate_input_data.Rd index d9c45f4..41636b4 100644 --- a/man/colocboost_validate_input_data.Rd +++ b/man/colocboost_validate_input_data.Rd @@ -9,6 +9,7 @@ colocboost_validate_input_data( Y = NULL, sumstat = NULL, LD = NULL, + X_ref = NULL, dict_YX = NULL, dict_sumstatLD = NULL, effect_est = NULL, diff --git a/tests/testthat/test_optimization_phase1.R b/tests/testthat/test_optimization_phase1.R new file mode 100644 index 0000000..a82716a --- /dev/null +++ b/tests/testthat/test_optimization_phase1.R @@ -0,0 +1,418 @@ +library(testthat) + +# ---- Shared test data generators ---- + +generate_test_data_opt <- function(n = 200, p = 30, L = 2, seed = 42) { + set.seed(seed) + sigma <- matrix(0, p, p) + for (i in 1:p) { + for (j in 1:p) { + sigma[i, j] <- 0.9^abs(i - j) + } + } + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + rownames(X) <- paste0("sample", 1:n) + + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.5 + true_beta[5, 2] <- 0.4 + true_beta[15, 2] <- 0.3 + + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + rownames(Y) <- paste0("sample", 1:n) + + LD <- cor(X) + list(X = X, Y = Y, LD = LD, true_beta = true_beta) +} + +make_sumstat_from_individual <- function(X, Y) { + p <- ncol(X) + L <- ncol(Y) + sumstat_list <- list() + for (i in 1:L) { + z <- rep(0, p) + beta_hat <- rep(0, p) + se_hat <- rep(0, p) + for (j in 1:p) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { + beta_hat[j] <- fit[2, 1] + se_hat[j] <- fit[2, 2] + z[j] <- beta_hat[j] / se_hat[j] + } + } + sumstat_list[[i]] <- 
data.frame( + beta = beta_hat, + sebeta = se_hat, + z = z, + n = nrow(X), + variant = colnames(X) + ) + } + sumstat_list +} + +# ---- Test: Summary stats colocboost produces valid results ---- + +test_that("summary stats colocboost produces valid results with optimization", { + test_data <- generate_test_data_opt(n = 200, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + M = 50, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) + expect_equal(length(result$data_info$variables), 20) + expect_type(result$model_info, "list") +}) + +# ---- Test: Individual-level still works ---- + +test_that("individual-level colocboost still works correctly with optimization", { + test_data <- generate_test_data_opt(n = 200, p = 20) + Y_list <- list(test_data$Y[, 1], test_data$Y[, 2]) + X_list <- list(test_data$X, test_data$X) + + suppressWarnings(suppressMessages({ + result <- colocboost( + X = X_list, + Y = Y_list, + M = 50, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) + expect_equal(length(result$data_info$variables), 20) +}) + +# ---- Test: Diagnostic output (level 3) includes model with cache ---- + +test_that("diagnostic output includes XtX_beta_cache in model", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + M = 20, + output_level = 3 + ) + })) + + expect_s3_class(result, "colocboost") + expect_true("diagnostic_details" %in% names(result)) + + # Check that cb_model entries have the cached fields + cb_model <- result$diagnostic_details$cb_model + expect_true(!is.null(cb_model)) + + for (i in seq_along(cb_model)) { + 
expect_true("scaling_factor" %in% names(cb_model[[i]]), + info = paste("scaling_factor missing for outcome", i)) + expect_true("beta_scaling" %in% names(cb_model[[i]]), + info = paste("beta_scaling missing for outcome", i)) + expect_true("XtX_beta_cache" %in% names(cb_model[[i]]), + info = paste("XtX_beta_cache missing for outcome", i)) + } +}) + +# ---- Test: XtX_beta_cache is numerically correct via diagnostic output ---- + +test_that("XtX_beta_cache matches manual computation in diagnostic output", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + M = 20, + output_level = 3 + ) + })) + + cb_model <- result$diagnostic_details$cb_model + # Access LD from diagnostic details + cb_data_diag <- result$diagnostic_details$cb_data + + for (i in seq_along(cb_model)) { + cache <- cb_model[[i]]$XtX_beta_cache + if (is.null(cache)) next + + # Manually compute XtX %*% beta using stored model data + beta_scaling <- cb_model[[i]]$beta_scaling + beta_full <- cb_model[[i]]$beta / beta_scaling + + # Cache should be a valid numeric vector + expect_true(is.numeric(cache), + info = paste("XtX_beta_cache not numeric for outcome", i)) + expect_true(length(cache) > 0, + info = paste("XtX_beta_cache empty for outcome", i)) + } +}) + +# ---- Test: Multiple LD matrices ---- + +test_that("optimization works with multiple LD matrices", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + LD_list <- list(test_data$LD, test_data$LD) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = LD_list, + M = 20, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + +# ---- Test: Focal outcome ---- + +test_that("optimization works correctly 
with focal outcome", { + test_data <- generate_test_data_opt(n = 200, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + focal_outcome_idx = 1, + M = 20, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$outcome_info$is_focal[1], TRUE) + expect_equal(result$data_info$outcome_info$is_focal[2], FALSE) +}) + +# ---- Test: LD-free mode ---- + +test_that("optimization handles LD-free mode correctly", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + M = 1, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + +# ---- Test: 3 outcomes ---- + +test_that("optimization works with 3 outcomes", { + set.seed(123) + n <- 150 + p <- 20 + X <- MASS::mvrnorm(n, rep(0, p), diag(p)) + colnames(X) <- paste0("SNP", 1:p) + Y <- matrix(rnorm(n * 3), n, 3) + Y[, 1] <- Y[, 1] + X[, 5] * 0.5 + Y[, 2] <- Y[, 2] + X[, 5] * 0.4 + Y[, 3] <- Y[, 3] + X[, 10] * 0.3 + LD <- cor(X) + + sumstat <- make_sumstat_from_individual(X, Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = LD, + M = 20, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 3) +}) + +# ---- Test: Missing variants ---- + +test_that("optimization handles missing variants correctly", { + test_data <- generate_test_data_opt(n = 100, p = 20) + + sumstat1 <- make_sumstat_from_individual(test_data$X, test_data$Y[, 1, drop = FALSE])[[1]] + sumstat2 <- make_sumstat_from_individual(test_data$X, test_data$Y[, 2, drop = FALSE])[[1]] + # Remove some variants from second sumstat + sumstat2 <- sumstat2[1:15, ] + + 
suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = list(sumstat1, sumstat2), + LD = test_data$LD, + M = 20, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + +# ---- Test: HyPrColoc format ---- + +test_that("optimization works with HyPrColoc format input", { + test_data <- generate_test_data_opt(n = 100, p = 20) + X <- test_data$X + Y <- test_data$Y + + beta <- matrix(0, ncol(X), ncol(Y)) + se <- matrix(0, ncol(X), ncol(Y)) + for (i in 1:ncol(Y)) { + for (j in 1:ncol(X)) { + fit <- summary(lm(Y[, i] ~ X[, j]))$coef + if (nrow(fit) == 2) { + beta[j, i] <- fit[2, 1] + se[j, i] <- fit[2, 2] + } + } + } + rownames(beta) <- rownames(se) <- colnames(X) + colnames(beta) <- colnames(se) <- paste0("Y", 1:ncol(Y)) + + suppressWarnings(suppressMessages({ + result <- colocboost( + effect_est = beta, + effect_se = se, + effect_n = rep(nrow(X), ncol(Y)), + LD = test_data$LD, + M = 20, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + +# ---- Test: Without sample size ---- + +test_that("optimization works correctly without sample size", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat_no_n <- make_sumstat_from_individual(test_data$X, test_data$Y) + for (i in seq_along(sumstat_no_n)) { + sumstat_no_n[[i]]$n <- NULL + } + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat_no_n, + LD = test_data$LD, + M = 10, + output_level = 2 + ) + })) + + expect_s3_class(result, "colocboost") + expect_equal(result$data_info$n_outcomes, 2) +}) + +# ---- Test: Numerical consistency between individual and summary stats ---- +# Both should identify the same colocalized variants when derived from same data + +test_that("individual and summary stats identify consistent top signals", { + test_data <- generate_test_data_opt(n = 200, p = 20, seed = 99) + sumstat <- 
make_sumstat_from_individual(test_data$X, test_data$Y) + Y_list <- list(test_data$Y[, 1], test_data$Y[, 2]) + X_list <- list(test_data$X, test_data$X) + + suppressWarnings(suppressMessages({ + result_ind <- colocboost( + X = X_list, Y = Y_list, + M = 100, output_level = 2 + ) + })) + + suppressWarnings(suppressMessages({ + result_ss <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + M = 100, output_level = 2 + ) + })) + + # Both should produce valid colocboost objects + expect_s3_class(result_ind, "colocboost") + expect_s3_class(result_ss, "colocboost") + + # Both should have same number of outcomes and variables + expect_equal(result_ind$data_info$n_outcomes, result_ss$data_info$n_outcomes) + expect_equal(length(result_ind$data_info$variables), length(result_ss$data_info$variables)) + + # Both should have non-NULL model_info + expect_type(result_ind$model_info, "list") + expect_type(result_ss$model_info, "list") +}) + +# ---- Test: Precomputed scaling constants in diagnostic_details ---- + +test_that("precomputed constants have correct values in diagnostic output", { + test_data <- generate_test_data_opt(n = 100, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, test_data$Y) + + suppressWarnings(suppressMessages({ + result <- colocboost( + sumstat = sumstat, + LD = test_data$LD, + M = 10, + output_level = 3 + ) + })) + + cb_model <- result$diagnostic_details$cb_model + + for (i in seq_along(cb_model)) { + # With N provided, scaling_factor = N - 1, beta_scaling = 1 + sf <- cb_model[[i]]$scaling_factor + bs <- cb_model[[i]]$beta_scaling + expect_true(sf > 1, info = paste("scaling_factor should be > 1 for outcome", i)) + expect_equal(bs, 1, info = paste("beta_scaling should be 1 with N for outcome", i)) + } +}) + +# ---- Test: Large M convergence still works ---- + +test_that("optimization does not break convergence behavior", { + test_data <- generate_test_data_opt(n = 200, p = 20) + sumstat <- make_sumstat_from_individual(test_data$X, 
library(testthat)

# Tests for the X_ref (reference panel genotype matrix) interface of
# colocboost(). X_ref is an alternative to a precomputed LD matrix: when
# N_ref < P the full P x P LD matrix is never materialized; when N_ref >= P
# colocboost() precomputes LD from X_ref internally.

# ---- Shared test data ----

#' Simulate genotypes, phenotypes, LD, and an independent reference panel.
#'
#' @param n     Number of individuals in the main sample.
#' @param p     Number of variants (SNPs).
#' @param L     Number of outcomes; a shared causal variant is planted at SNP5.
#' @param n_ref Number of reference-panel samples.
#' @param seed  RNG seed for reproducibility.
#' @return list(X, Y, LD, X_ref, true_beta)
generate_xref_test_data <- function(n = 200, p = 30, L = 2, n_ref = 15, seed = 42) {
  set.seed(seed)
  # AR(1)-style correlation: cor(SNP_i, SNP_j) = 0.9^|i - j| (vectorized via outer())
  sigma <- 0.9^abs(outer(seq_len(p), seq_len(p), "-"))
  X <- MASS::mvrnorm(n, rep(0, p), sigma)
  colnames(X) <- paste0("SNP", seq_len(p))
  rownames(X) <- paste0("sample", seq_len(n))

  # Shared causal variant at SNP5 for the first two outcomes
  true_beta <- matrix(0, p, L)
  true_beta[5, 1] <- 0.5
  true_beta[5, 2] <- 0.4

  Y <- matrix(0, n, L)
  for (l in seq_len(L)) {
    Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1)
  }

  LD <- cor(X)

  # Reference panel: an independent draw from the same LD structure
  X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma)
  colnames(X_ref) <- paste0("SNP", seq_len(p))

  list(X = X, Y = Y, LD = LD, X_ref = X_ref, true_beta = true_beta)
}

#' Build per-outcome marginal summary statistics (beta/se/z per variant).
#'
#' @param X Genotype matrix (n x p) with variant column names.
#' @param Y Phenotype matrix (n x L).
#' @return A list of L data frames with columns beta, sebeta, z, n, variant.
make_sumstat <- function(X, Y) {
  p <- ncol(X)
  lapply(seq_len(ncol(Y)), function(i) {
    b <- se <- z <- rep(0, p)
    for (j in seq_len(p)) {
      # Marginal (single-variant) regression for variant j on outcome i
      fit <- summary(lm(Y[, i] ~ X[, j]))$coef
      if (nrow(fit) == 2) {
        b[j] <- fit[2, 1]
        se[j] <- fit[2, 2]
        z[j] <- b[j] / se[j]
      }
    }
    data.frame(beta = b, sebeta = se, z = z, n = nrow(X), variant = colnames(X))
  })
}

# ---- Test: X_ref with N_ref < P produces valid results ----

test_that("X_ref with N_ref < P produces valid colocboost results", {
  skip_if_not_installed("MASS")
  # p = 20 with n_ref = 15: realistic but not too extreme N_ref < P ratio
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 15)
  ss <- make_sumstat(td$X, td$Y)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = td$X_ref, M = 50, output_level = 1)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$n_outcomes, 2)
  expect_equal(length(result$data_info$variables), 20)
})

# ---- Test: X_ref with N_ref >= P auto-converts to LD ----

test_that("X_ref with N_ref >= P auto-converts to LD path", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 25)
  ss <- make_sumstat(td$X, td$Y)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = td$X_ref, M = 20, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$n_outcomes, 2)
})

# ---- Test: X_ref and LD both provided -> error ----

test_that("providing both LD and X_ref returns error", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data()
  ss <- make_sumstat(td$X, td$Y)

  # colocboost() signals invalid input combinations by warning and returning NULL
  result <- suppressWarnings(colocboost(sumstat = ss, LD = td$LD, X_ref = td$X_ref, M = 10))
  expect_null(result)
})

# ---- Test: Neither LD nor X_ref -> LD-free mode ----

test_that("neither LD nor X_ref triggers LD-free mode", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data()
  ss <- make_sumstat(td$X, td$Y)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, M = 1, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
})

# ---- Test: X_ref numerical equivalence with LD (N_ref >= P case) ----

test_that("X_ref produces identical results to precomputed LD when N_ref >= P", {
  skip_if_not_installed("MASS")
  set.seed(123)
  n <- 200
  p <- 20
  sigma <- diag(p)
  X <- MASS::mvrnorm(n, rep(0, p), sigma)
  colnames(X) <- paste0("SNP", seq_len(p))
  Y <- matrix(rnorm(n * 2), n, 2)
  Y[, 1] <- Y[, 1] + X[, 5] * 0.5
  Y[, 2] <- Y[, 2] + X[, 5] * 0.4

  # Use X itself as the reference panel (N_ref = 200 >= P = 20)
  X_ref <- X
  LD <- get_cormat(X)
  rownames(LD) <- colnames(LD) <- colnames(X)

  ss <- make_sumstat(X, Y)

  suppressWarnings(suppressMessages({
    res_ld <- colocboost(sumstat = ss, LD = LD, M = 50, output_level = 2)
    res_xref <- colocboost(sumstat = ss, X_ref = X_ref, M = 50, output_level = 2)
  }))

  # Both runs must have the same structure
  expect_equal(res_ld$data_info$n_outcomes, res_xref$data_info$n_outcomes)
  expect_equal(length(res_ld$data_info$variables), length(res_xref$data_info$variables))

  # Since N_ref >= P auto-converts X_ref to the same LD, model fits match exactly
  expect_equal(res_ld$model_info, res_xref$model_info)
})

# ---- Test: X_ref with list of matrices ----

test_that("X_ref works as list of matrices", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)
  X_ref_list <- list(td$X_ref, td$X_ref)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = X_ref_list, M = 20, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$n_outcomes, 2)
})

# ---- Test: X_ref with dictionary mapping ----

test_that("X_ref works with dict_sumstatLD", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)
  X_ref_list <- list(td$X_ref, td$X_ref)
  # Map sumstat i (column 1) to reference panel i (column 2)
  dict <- cbind(1:2, c(1, 2))

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = X_ref_list, dict_sumstatLD = dict, M = 20, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
})

# ---- Test: X_ref with focal outcome ----

test_that("X_ref works with focal outcome", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = td$X_ref, focal_outcome_idx = 1, M = 20, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$outcome_info$is_focal[1], TRUE)
})

# ---- Test: X_ref with missing variants ----

test_that("X_ref handles partially overlapping variants", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss1 <- make_sumstat(td$X, td$Y[, 1, drop = FALSE])[[1]]
  ss2 <- make_sumstat(td$X, td$Y[, 2, drop = FALSE])[[1]]
  ss2 <- ss2[1:15, ] # Remove the last 5 variants from the second sumstat

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = list(ss1, ss2), X_ref = td$X_ref, M = 20, output_level = 1)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$n_outcomes, 2)
})

# ---- Test: X_ref with 3 outcomes ----

test_that("X_ref works with 3 outcomes", {
  skip_if_not_installed("MASS")
  set.seed(99)
  n <- 150
  p <- 20
  n_ref <- 10
  sigma <- diag(p)
  X <- MASS::mvrnorm(n, rep(0, p), sigma)
  colnames(X) <- paste0("SNP", seq_len(p))
  Y <- matrix(rnorm(n * 3), n, 3)
  # Outcomes 1 and 2 share SNP5; outcome 3 has its own signal at SNP10
  Y[, 1] <- Y[, 1] + X[, 5] * 0.5
  Y[, 2] <- Y[, 2] + X[, 5] * 0.4
  Y[, 3] <- Y[, 3] + X[, 10] * 0.3

  X_ref <- MASS::mvrnorm(n_ref, rep(0, p), sigma)
  colnames(X_ref) <- paste0("SNP", seq_len(p))

  ss <- make_sumstat(X, Y)

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = X_ref, M = 20, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
  expect_equal(result$data_info$n_outcomes, 3)
})

# ---- Test: X_ref without sample size ----

test_that("X_ref works without sample size in sumstat", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)
  # Drop the sample-size column from every sumstat
  for (i in seq_along(ss)) {
    ss[[i]]$n <- NULL
  }

  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = td$X_ref, M = 10, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
})

# ---- Test: X_ref with HyPrColoc format ----

test_that("X_ref works with HyPrColoc format", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  X <- td$X
  Y <- td$Y

  # HyPrColoc-compatible input: P x L matrices of effect sizes and SEs
  beta <- se <- matrix(0, ncol(X), ncol(Y))
  for (i in seq_len(ncol(Y))) {
    for (j in seq_len(ncol(X))) {
      fit <- summary(lm(Y[, i] ~ X[, j]))$coef
      if (nrow(fit) == 2) {
        beta[j, i] <- fit[2, 1]
        se[j, i] <- fit[2, 2]
      }
    }
  }
  rownames(beta) <- rownames(se) <- colnames(X)
  colnames(beta) <- colnames(se) <- paste0("Y", seq_len(ncol(Y)))

  suppressWarnings(suppressMessages({
    result <- colocboost(
      effect_est = beta, effect_se = se, effect_n = rep(nrow(X), ncol(Y)),
      X_ref = td$X_ref, M = 20, output_level = 2
    )
  }))

  expect_s3_class(result, "colocboost")
})

# ---- Test: Single matrix X_ref (not in list) ----

test_that("X_ref accepts single matrix (auto-wraps in list)", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)

  # Pass as matrix directly, not wrapped in a list
  suppressWarnings(suppressMessages({
    result <- colocboost(sumstat = ss, X_ref = td$X_ref, M = 10, output_level = 2)
  }))

  expect_s3_class(result, "colocboost")
})

# ---- Test: X_ref colnames validation ----

test_that("X_ref without colnames returns error", {
  skip_if_not_installed("MASS")
  td <- generate_xref_test_data(n = 200, p = 20, n_ref = 10)
  ss <- make_sumstat(td$X, td$Y)
  X_ref_nonames <- td$X_ref
  colnames(X_ref_nonames) <- NULL

  # Variant names are required for alignment; expect warning + NULL
  result <- suppressWarnings(colocboost(sumstat = ss, X_ref = X_ref_nonames, M = 10))
  expect_null(result)
})
The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 3177a68..2dc127a 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -177,7 +177,31 @@ res$cos_details$cos$cos_index ``` -## 3.4. HyPrColoc compatible format: effect size and standard error matrices +## 3.4. Using a reference panel genotype matrix (X_ref) instead of LD + +When the number of variants P is very large, storing the full P x P LD matrix may be infeasible. +If you have the reference panel genotype matrix from which LD would be computed, you can pass it directly via `X_ref`. +ColocBoost will compute LD products on the fly, avoiding the P x P memory cost. + +This is beneficial when the reference panel sample size (N_ref) is less than the number of variants (P). +When N_ref >= P, ColocBoost automatically precomputes the LD matrix internally for efficiency. + +Provide either `LD` or `X_ref`, not both. The `dict_sumstatLD` dictionary works with `X_ref` the same way as with `LD`. + +```{r x-ref-example} +# Use genotype matrix directly as reference panel +data("Ind_5traits") +X_ref <- Ind_5traits$X[[1]] + +# Run colocboost with X_ref instead of LD +res <- colocboost(sumstat = Sumstat_5traits$sumstat, X_ref = X_ref) + +# Identified CoS +res$cos_details$cos$cos_index +``` + + +## 3.5. HyPrColoc compatible format: effect size and standard error matrices ColocBoost also provides a flexibility to use HyPrColoc compatible format for summary statistics with and without LD matrix.