diff --git a/R/colocboost.R b/R/colocboost.R index d2f994f..82bff67 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -468,8 +468,8 @@ colocboost <- function(X = NULL, Y = NULL, # individual data warning( "Providing the sample size (n), or even a rough estimate of n, ", "is highly recommended. Without n, the implicit assumption is ", - "n is large (Inf) and the effect sizes are small (close to zero).", - "outcome ", paste(p_no, collapse = ","), " in sumstat don't contain 'n'!" + "n is large (Inf) and the effect sizes are small (close to zero). ", + "Outcome ", paste(p_no, collapse = ", "), " in sumstat don't contain 'n'!" ) } diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 4fc64e3..964376c 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -126,7 +126,7 @@ colocboost_assemble <- function(cb_obj, if (!is.null(cb_obj_single$cb_data$data[[1]][["XtY"]])) { if (is.null(cb_obj_single$cb_data$data[[1]]$XtX)) { X_dict <- cb_obj$cb_data$dict[i] - cb_obj_single$cb_data$data[[1]]$X <- cb_obj$cb_data$data[[X_dict]]$XtX + cb_obj_single$cb_data$data[[1]]$XtX <- cb_obj$cb_data$data[[X_dict]]$XtX } } class(cb_obj_single) <- "colocboost" diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index bdc1f45..9e78c14 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -584,6 +584,8 @@ estimate_change_profile_res <- function(jk, xtr <- t(X[, jk]) %*% res / (N - 1) } else if (!is.null(XtY)) { scaling_factor <- if (!is.null(N)) (N - 1) else 1 + beta_scaling <- if (!is.null(N)) 1 else 100 + beta_k <- beta_k / beta_scaling yty <- YtY / scaling_factor xtr <- res[jk] / scaling_factor xtx <- XtX # / scaling_factor diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index e89acfa..511a258 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -207,7 +207,6 @@ w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverag return(is_pure) } - #' Function to remove the spurious signals #' @importFrom utils head tail #' @keywords cb_post_inference @@ -231,6 +230,8 @@ check_null_post <- function(cb_obj, mean((Y - X %*% as.matrix(cs_beta))^2) * N / (N - 1) } else if (!is.null(XtY)) { scaling_factor <- if (!is.null(N)) (N - 1) else 1 + beta_scaling <- if (!is.null(N)) 1 else 100 + cs_beta <- cs_beta / beta_scaling yty <- YtY / scaling_factor xtx <- XtX if (length(miss_idx) != 0) { @@ -283,14 +284,15 @@ check_null_post <- function(cb_obj, return(Y - X %*% cs_beta) } else if (!is.null(XtX)) { scaling.factor <- if (!is.null(N)) N - 1 else 1 + beta_scaling <- if (!is.null(N)) 1 else 100 xtx <- XtX / scaling.factor if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - res.tmp[-miss_idx] <- xty - xtx %*% cs_beta[-miss_idx] + res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) } else { xty <- XtY / scaling.factor - res.tmp <- xty - xtx %*% cs_beta + res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) } return(res.tmp) } @@ -476,6 +478,8 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { yty <- var(Y) } else if (!is.null(XtY)) { scaling_factor <- if (!is.null(N)) (N - 1) else 1 + beta_scaling <- if (!is.null(N)) 1 else 100 + cs_beta <- cs_beta / beta_scaling yty <- YtY / scaling_factor xtx <- XtX if (length(miss_idx) != 0) { diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 2cbcbda..1fe75c4 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -41,10 +41,12 @@ colocboost_init_data <- function(X, Y, dict_YX, class(cb_data) <- "colocboost" if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) + n_ind_variable <- max(dict_YX) } else if (!is.null(dict_YX) & is.null(dict_sumstatLD)) { dict <- dict_YX } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- dict_sumstatLD + n_ind_variable <- 0 } if (focal_outcome_variables & !is.null(focal_outcome_idx)) { if (focal_outcome_idx > length(dict)) { @@ -85,7 +87,7 @@ colocboost_init_data <- function(X, Y, dict_YX, if (!is.null(Z) & !is.null(LD)) { ####################### need to consider more ######################### # ------ only code up one sumstat - variant_lists <- keep_variables[c(flag:length(keep_variables))] + variant_lists <- keep_variables[c((n_ind_variable+1):length(keep_variables))] sumstat_formated <- process_sumstat( Z, N_sumstat, Var_y, SeBhat, LD, variant_lists, dict_sumstatLD, @@ -350,6 +352,8 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, } else if (!is.null(XtY)) { corr <- rep(0, length(XtY)) scaling_factor <- if (!is.null(N)) (N - 1) else 1 + beta_scaling <- if (!is.null(N)) 1 else 100 + beta_k <- beta_k / beta_scaling YtY <- YtY / scaling_factor XtX <- XtX if (length(miss_idx) != 0) { diff --git a/R/colocboost_output.R b/R/colocboost_output.R index 05b3dd4..b347b4a 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -744,8 +744,14 @@ get_data_info <- function(cb_obj) { is_focal[cb_obj$cb_model_para$focal_outcome_idx] <- TRUE } is_sumstat <- grepl("sumstat_outcome", names(cb_obj$cb_data$data)) + N <- cb_obj$cb_model_para$N + check_no_N <- sapply(cb_obj$cb_model_para$N, is.null) + if (sum(check_no_N)!=0){ + N[which(check_no_N)] <- "NA" + N <- unlist(N) + } outcome_info <- data.frame( - "outcome_names" = analysis_outcome, "sample_size" = cb_obj$cb_model_para$N, + "outcome_names" = analysis_outcome, "sample_size" = N, "is_sumstats" = is_sumstat, "is_focal" = is_focal ) rownames(outcome_info) <- paste0("y", 1:n_outcome) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 9109b66..24be049 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -104,15 +104,16 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { profile_log <- mean((y - x %*% beta)^2) * adj_dep } else if (!is.null(cb_data$data[[X_dict]]$XtX)) { scaling_factor <- if (!is.null(cb_data$data[[i]]$N)) cb_data$data[[i]]$N - 1 else 1 + beta_scaling <- if (!is.null(cb_data$data[[i]]$N)) 1 else 100 # - summary statistics xtx <- cb_data$data[[X_dict]]$XtX cb_model[[i]]$res <- rep(0, cb_model_para$P) if (length(cb_data$data[[i]]$variable_miss) != 0) { - beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] + beta <- beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta } else { - beta <- cb_model[[i]]$beta + beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta } diff --git a/R/colocboost_workhorse.R b/R/colocboost_workhorse.R index 6303024..1269c2b 100644 --- a/R/colocboost_workhorse.R +++ b/R/colocboost_workhorse.R @@ -69,9 +69,10 @@ colocboost_workhorse <- function(cb_data, func_compare = func_compare, coloc_thresh = coloc_thresh ) - + + M_single_outcome <- 200 if (is.null(M)) { - M <- cb_model_para$L * 200 + M <- cb_model_para$L * M_single_outcome } cb_model_para$M <- M # - if multi_test value > multi_test cutoff for some outcomes, we will not update them. @@ -158,9 +159,10 @@ colocboost_workhorse <- function(cb_data, ) cb_model[[i]]$multi_correction <- multiple_testing_correction stop2 <- all(multiple_testing_correction >= cb_model[[i]]$stop_null) + stop3 <- (M_i > M_single_outcome) # -- to ensure if some outcomes do not update previously if (length(cb_model[[i]]$profile_loglike_each) >= 2) { - stop[i] <- (stop1 | stop2) # (stop2 | stop3) # (stop1 | stop2 | stop3) + stop[i] <- (stop1 | stop2 | stop3) # (stop2 | stop3) # (stop1 | stop2 | stop3) } else { stop[i] <- FALSE } diff --git a/tests/testthat/test_sumstats.R b/tests/testthat/test_sumstats.R index e69de29..30e13ad 100644 --- a/tests/testthat/test_sumstats.R +++ b/tests/testthat/test_sumstats.R @@ -0,0 +1,244 @@ +library(testthat) + +# Copy generate_test_data from test_colocboost.R +generate_test_data <- function(n = 100, p = 20, L = 2, seed = 42) { + set.seed(seed) + + # Generate X with LD structure + sigma <- matrix(0, p, p) + for (i in 1:p) { + for (j in 1:p) { + sigma[i, j] <- 0.9^abs(i - j) + } + } + X <- MASS::mvrnorm(n, rep(0, p), sigma) + colnames(X) <- paste0("SNP", 1:p) + + # Generate true effects - create a shared causal variant + true_beta <- matrix(0, p, L) + true_beta[5, 1] <- 0.5 # SNP5 affects trait 1 + true_beta[5, 2] <- 0.4 # SNP5 also affects trait 2 (colocalized) + true_beta[10, 2] <- 0.3 # SNP10 only affects trait 2 + + # Generate Y with some noise + Y <- matrix(0, n, L) + for (l in 1:L) { + Y[, l] <- X %*% true_beta[, l] + rnorm(n, 0, 1) + } + + # Prepare LD matrix + LD <- cor(X) + + # Return test objects + list( + X = X, + Y = Y, + LD = LD, + true_beta = true_beta + ) +} + +# Use the existing test data generation function +# But extend it to create summary statistics directly +generate_sumstat_test_data <- function(n = 100, p = 20, L = 2, seed = 42) { + # Get individual level data first + test_data <- generate_test_data(n, p, L, seed) + + # Generate summary statistics from individual data + X <- test_data$X + Y <- test_data$Y + LD <- test_data$LD + + # Calculate beta, se, and z-scores + beta <- matrix(0, ncol(X), ncol(Y)) + se <- matrix(0, ncol(X), ncol(Y)) + z <- matrix(0, ncol(X), ncol(Y)) + + for (i in 1:ncol(Y)) { + output = matrix(0,ncol(X),2) + for (j in 1:ncol(X)) { + fit = summary(lm(Y[,i] ~ X[,j]))$coef + if (nrow(fit) == 2) + output[j,] = as.vector(fit[2,1:2]) + else + output[j,] = c(0,0) + } + beta[,i] <- output[,1] + se[,i] <- output[,2] + z[,i] <- beta[,i]/se[,i] + } + + # Create summary statistics data frames + sumstat_list <- list() + for (i in 1:ncol(Y)) { + sumstat_list[[i]] <- data.frame( + beta = beta[,i], + sebeta = se[,i], + z = z[,i], + n = nrow(X), + variant = colnames(X) + ) + } + + # Add effect matrices for HyPrColoc format + rownames(beta) <- rownames(se) <- colnames(X) + colnames(beta) <- colnames(se) <- paste0("Y", 1:ncol(Y)) + + # Return all formats + list( + X = X, + Y = Y, + LD = LD, + sumstat = sumstat_list, + effect_est = beta, + effect_se = se, + effect_n = rep(n, ncol(Y)) + ) +} + +# Create summary statistics test data +test_sumstat_data <- generate_sumstat_test_data() + +# Test 1: Basic summary statistics input +test_that("colocboost runs with basic summary statistics format", { + # Run colocboost with sumstat and single LD matrix + result <- colocboost( + sumstat = test_sumstat_data$sumstat, + LD = test_sumstat_data$LD, + M = 10, # Small number of iterations for testing + output_level = 2 # More detailed output for testing + ) + + # Test that we get a colocboost object + expect_s3_class(result, "colocboost") + + # Check structure of results + expect_type(result$data_info, "list") + expect_type(result$model_info, "list") + + # Check dimensions + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) + expect_equal(result$data_info$n_outcomes, 2) +}) + +# Test 2: HyPrColoc compatible format +test_that("colocboost runs with HyPrColoc compatible format", { + # Run colocboost with effect matrices + result <- colocboost( + effect_est = test_sumstat_data$effect_est, + effect_se = test_sumstat_data$effect_se, + effect_n = test_sumstat_data$effect_n, + LD = test_sumstat_data$LD, + M = 10, # Small number of iterations for testing + output_level = 2 # More detailed output for testing + ) + + # Test that we get a colocboost object + expect_s3_class(result, "colocboost") + + # Check dimensions + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) + expect_equal(result$data_info$n_outcomes, 2) +}) + +# Test 3: Multiple LD matrices with summary statistics +test_that("colocboost runs with matched LD matrices", { + # Create a list of identical LD matrices for demonstration + LD_list <- list(test_sumstat_data$LD, test_sumstat_data$LD) + + # Run colocboost with summary statistics and multiple LD matrices + result <- colocboost( + sumstat = test_sumstat_data$sumstat, + LD = LD_list, + M = 10, # Small number of iterations for testing + output_level = 2 # More detailed output for testing + ) + + # Test that we get a colocboost object + expect_s3_class(result, "colocboost") + + # Check dimensions + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) + expect_equal(result$data_info$n_outcomes, 2) +}) + +# Test 4: Arbitrary LD matrices with dictionary mapping +test_that("colocboost runs with dictionary-mapped LD matrices", { + # Create a list of identical LD matrices for demonstration + LD_list <- list(test_sumstat_data$LD, test_sumstat_data$LD) + + # Create a dictionary mapping sumstat indices to LD indices + # First sumstat uses first LD, second sumstat uses second LD + dict_sumstatLD <- matrix(c(1:2, 1:2), ncol = 2) + + # Run colocboost with dictionary mapping + result <- colocboost( + sumstat = test_sumstat_data$sumstat, + LD = LD_list, + dict_sumstatLD = dict_sumstatLD, + M = 10, # Small number of iterations for testing + output_level = 2 # More detailed output for testing + ) + + # Test that we get a colocboost object + expect_s3_class(result, "colocboost") + + # Check dimensions + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) + expect_equal(result$data_info$n_outcomes, 2) +}) + +# Test 5: Missing LD matrix (LD-free approach) +test_that("colocboost runs with missing LD matrix", { + # Run colocboost without LD matrix (should default to diagonal) + result <- colocboost( + sumstat = test_sumstat_data$sumstat, + M = 1, # Only one iteration expected in LD-free case + output_level = 2 # More detailed output for testing + ) + + # Test that we get a colocboost object + expect_s3_class(result, "colocboost") + + # Check dimensions + expect_equal(length(result$data_info$variables), ncol(test_sumstat_data$X)) + expect_equal(result$data_info$n_outcomes, 2) +}) + +# Test 6: Error handling for mismatched inputs +test_that("colocboost handles mismatched inputs correctly", { + # Create mismatched effect matrices + bad_effect_est <- test_sumstat_data$effect_est[1:(nrow(test_sumstat_data$effect_est)-1), ] + + # Expect error with mismatched dimensions + expect_warning( + colocboost( + effect_est = bad_effect_est, + effect_se = test_sumstat_data$effect_se + ), + "Error: effect_est and effect_se should be the same dimension!" + ) +}) + +# Test 7: Summary statistics without sample size +test_that("colocboost handles missing sample size correctly", { + # Create summary stats without sample size + sumstat_no_n <- lapply(test_sumstat_data$sumstat, function(df) { + df$n <- NULL + return(df) + }) + + # Should run but give a warning about missing sample size + expect_warning( + result <- colocboost( + sumstat = sumstat_no_n, + LD = test_sumstat_data$LD, + M = 10, + output_level = 2 + ), + "Providing the sample size" + ) + + # Still should get a valid result + expect_s3_class(result, "colocboost") +}) \ No newline at end of file diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd index 1ddd97e..900a9e9 100644 --- a/vignettes/Summary_Statistics_Colocalization.Rmd +++ b/vignettes/Summary_Statistics_Colocalization.Rmd @@ -51,7 +51,8 @@ Sumstat_5traits$true_effect_variants `sumstat` must include the following columns: - `z` or (`beta`, `sebeta`): either z-score or (effect size and standard error) -- `n`: sample size for the summary statistics, it is highly recommended to provide. +- `n`: sample size for the summary statistics. **Higly recommended**: Provding the sample size, or even a rough estimate of `n`, +is highly recommended. Without `n`, the implicit assumption is `n` is large (Inf) and the effect sizes are small (close to zero). - `variant`: required if `sumstat` for different outcomes do not have the same number of variables (multiple `sumstat` and multiple `LD`).