From 70d4e8b86bb778bfa64e461ab8c70b7d3f3a0dbd Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Tue, 6 May 2025 08:43:56 -0400 Subject: [PATCH 01/13] One iteration and LD free --- R/colocboost_assemble.R | 10 ++++------ R/colocboost_init.R | 3 ++- R/colocboost_one_causal.R | 3 ++- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 9ccf7cd..0a895bc 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -71,9 +71,11 @@ colocboost_assemble <- function(cb_obj, } } } else { - if (cb_obj$cb_model_para$M == 1) { + if (cb_obj$cb_model_para$model_used == "LD_free") { check_null_method <- "obj" - check_null <- check_null_max <- 0.1 + check_null_max <- check_null + } else if (cb_obj$cb_model_para$model_used == "one_causal"){ + check_null_max <- check_null_max * 0.1 } cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_method = check_null_method) # --------- about colocalized confidence sets --------------------------------- @@ -202,10 +204,6 @@ colocboost_assemble <- function(cb_obj, # - colocalization results cb_obj$cb_model_para$weight_fudge_factor <- weight_fudge_factor cb_obj$cb_model_para$coverage <- coverage - if (cb_obj$cb_model_para$M == 1) { - # fixme for single iteration model - cb_obj <- get_max_profile(cb_obj, check_null_max = 0.01, check_null_method = "profile") - } cos_results <- get_cos_details(cb_obj, coloc_out = past_out$cos$cos, data_info = data_info) cb_output <- list( "vcp" = cos_results$vcp, diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 76c6ceb..00a0b24 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -299,7 +299,8 @@ colocboost_init_para <- function(cb_data, cb_model, tau = 0.01, "num_updates_outcome" = num_updates_outcome, "func_multi_test" = func_multi_test, "multi_test_thresh" = multi_test_thresh, - "multi_test_max" = multi_test_max + "multi_test_max" = multi_test_max, + 
"model_used" = "original" ) class(cb_model_para) <- "colocboost" diff --git a/R/colocboost_one_causal.R b/R/colocboost_one_causal.R index 0d70f1d..0ed96b5 100644 --- a/R/colocboost_one_causal.R +++ b/R/colocboost_one_causal.R @@ -17,9 +17,10 @@ colocboost_one_causal <- function(cb_model, cb_model_para, cb_data) { if (cb_model_para$jk_equiv_corr != 0) { cb_obj <- colocboost_one_iteration(cb_model, cb_model_para, cb_data) - + cb_obj$cb_model_para$model_used <- "one_causal" } else { cb_obj <- colocboost_diagLD(cb_model, cb_model_para, cb_data) + cb_obj$cb_model_para$model_used <- "LD_free" } return(cb_obj) } From b51265eefde9914fe4010c645a255bf9b388cc93 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 10 May 2025 11:43:23 -0400 Subject: [PATCH 02/13] Update colocboost.R checking missing values in LD matrix: - will cause an error with 'var_r' is not a logic value --- R/colocboost.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/colocboost.R b/R/colocboost.R index 4c5f651..4ce07ea 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -428,6 +428,12 @@ colocboost <- function(X = NULL, Y = NULL, # individual data if (is.matrix(LD)) { LD <- list(LD) } + # - check if NA in LD matrix + num_na <- sapply(LD, sum) + if (any(is.na(num_na))){ + warning("Error: Input LD must not contain missing values (NA).") + return(NULL) + } if (length(LD) == 1) { sumstatLD_dict <- rep(1, length(sumstat)) } else if (length(LD) == length(sumstat)) { From 2a970216fceca3865c69e2601bf64d64ef217253 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 15 May 2025 18:05:28 -0500 Subject: [PATCH 03/13] optimization for LD free version code remove the construction of the diag LD matrix to save the memory. 
--- R/colocboost.R | 36 +++++++++++++++++-------------- R/colocboost_check_update_jk.R | 17 ++++++++++----- R/colocboost_inference.R | 31 ++++++++++++++++++++------- R/colocboost_init.R | 39 ++++++++++++++++++++-------------- R/colocboost_update.R | 26 ++++++++++++++++++----- 5 files changed, 99 insertions(+), 50 deletions(-) diff --git a/R/colocboost.R b/R/colocboost.R index 4ce07ea..0227d36 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -398,22 +398,26 @@ colocboost <- function(X = NULL, Y = NULL, # individual data "Without LD, only a single iteration will be performed under the assumption of one causal variable per outcome. ", "Additionally, the purity of CoS cannot be evaluated!" ) - - p.sumstat <- sapply(keep_variable_sumstat, length) - p.unique <- unique(p.sumstat) - if (length(p.unique) == 1) { - ld <- diag(1, nrow = p.unique) - colnames(ld) <- rownames(ld) <- keep_variable_sumstat[[1]] - LD <- list(ld) - sumstatLD_dict <- rep(1, length(sumstat)) - } else { - LD <- lapply(keep_variable_sumstat, function(sn) { - ld <- diag(1, nrow = length(sn)) - colnames(ld) <- rownames(ld) <- sn - return(ld) - }) - sumstatLD_dict <- 1:length(sumstat) - } + + LD <- 1 + sumstatLD_dict <- rep(1, length(sumstat)) + +# +# p.sumstat <- sapply(keep_variable_sumstat, length) +# p.unique <- unique(p.sumstat) +# if (length(p.unique) == 1) { +# ld <- diag(1, nrow = p.unique) +# colnames(ld) <- rownames(ld) <- keep_variable_sumstat[[1]] +# LD <- list(ld) +# sumstatLD_dict <- rep(1, length(sumstat)) +# } else { +# LD <- lapply(keep_variable_sumstat, function(sn) { +# ld <- diag(1, nrow = length(sn)) +# colnames(ld) <- rownames(ld) <- sn +# return(ld) +# }) +# sumstatLD_dict <- 1:length(sumstat) +# } # change some algorithm parameters M <- 1 # one iteration diff --git a/R/colocboost_check_update_jk.R b/R/colocboost_check_update_jk.R index 9e78c14..9fbcdb2 100644 --- a/R/colocboost_check_update_jk.R +++ b/R/colocboost_check_update_jk.R @@ -498,10 +498,13 @@ get_LD_jk1_jk2 <- 
function(jk1, jk2, LD_temp[which(is.na(LD_temp))] <- 0 LD_temp <- LD_temp[1, 2] } else if (!is.null(XtX)) { - jk1.remain <- which(remain_jk == jk1) - jk2.remain <- which(remain_jk == jk2) - # scaling <- if (!is.null(N)) N-1 else 1 - LD_temp <- XtX[jk1.remain, jk2.remain] # / scaling + if (sum(XtX) == 1){ + LD_temp <- 0 + } else { + jk1.remain <- which(remain_jk == jk1) + jk2.remain <- which(remain_jk == jk2) + LD_temp <- XtX[jk1.remain, jk2.remain] + } } return(LD_temp) } @@ -595,7 +598,11 @@ estimate_change_profile_res <- function(jk, } else { xty <- XtY / scaling_factor } - rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) + if (sum(xtx) == 1){ + rtr <- yty - 2 * sum(beta_k * xty) + sum(beta_k^2) + } else { + rtr <- yty - 2 * sum(beta_k * xty) + sum((xtx %*% as.matrix(beta_k)) * beta_k) + } } numerator <- xtr^2 / (2 * rtr) denominator <- 0.5 * log(2 * pi * rtr) + 0.5 diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index bc709b5..5fd2174 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -240,8 +240,11 @@ check_null_post <- function(cb_obj, } else { xty <- XtY / scaling_factor } - - (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + if (sum(xtx) == 1){ + (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + } else { + (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + } } } @@ -289,10 +292,18 @@ check_null_post <- function(cb_obj, if (length(miss_idx) != 0) { xty <- XtY[-miss_idx] / scaling.factor res.tmp <- rep(0, length(XtY)) - res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + if (sum(xtx) == 1){ + res.tmp[-miss_idx] <- xty - cs_beta[-miss_idx] / beta_scaling + } else { + res.tmp[-miss_idx] <- xty - xtx %*% (cs_beta[-miss_idx] / beta_scaling) + } } else { xty <- XtY / scaling.factor - res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + if (sum(xtx) == 1){ + res.tmp <- xty - (cs_beta / 
beta_scaling) + } else { + res.tmp <- xty - xtx %*% (cs_beta / beta_scaling) + } } return(res.tmp) } @@ -346,11 +357,12 @@ check_null_post <- function(cb_obj, YtY = cb_data$data[[X_dict]]$YtY, miss_idx = cb_data$data[[j]]$variable_miss ) - last_obj <- min(cb_obj$cb_model[[j]]$obj_path) - change <- abs(cs_obj - last_obj) if (length(cb_obj$cb_model[[j]]$obj_path) == 1) { total_obj <- 1 + change <- cs_obj } else { + last_obj <- min(cb_obj$cb_model[[j]]$obj_path) + change <- abs(cs_obj - last_obj) total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) } check_cs_change[i, j] <- change / total_obj @@ -488,8 +500,11 @@ get_cos_evidence <- function(cb_obj, coloc_out, data_info) { } else { xty <- XtY / scaling_factor } - - cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + if (sum(xtx) == 1){ + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum(cs_beta^2)) * adj_dep + } else { + cos_profile <- (yty - 2 * sum(cs_beta * xty) + sum((xtx %*% as.matrix(cs_beta)) * cs_beta)) * adj_dep + } } delta <- yty - cos_profile if (delta <= 0) { diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 00a0b24..07de832 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -375,7 +375,11 @@ get_correlation <- function(X = NULL, res = NULL, XtY = NULL, N = NULL, Xtr <- res / scaling_factor XtY <- XtY / scaling_factor } - var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + if (sum(XtX) == 1){ + var_r <- YtY - 2 * sum(beta_k * XtY) + sum(beta_k^2) + } else { + var_r <- YtY - 2 * sum(beta_k * XtY) + sum((XtX %*% as.matrix(beta_k)) * beta_k) + } if (var_r > 1e-6) { corr_nomiss <- Xtr / sqrt(var_r) if (length(miss_idx) != 0) { @@ -574,21 +578,24 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic Z_extend[pos_target] <- current_z[pos_z] # Calculate submatrix for each unique entry (not duplicates) - ld_submatrix <- NULL - - if (length(common_variants) > 0) { - # Only 
include the submatrix if this entry is unique or is the first occurrence - if (i == unified_dict[i]) { - # Check if common_variants and rownames have identical order - if (identical(common_variants, rownames(current_ld_matrix))) { - # If order is identical, use the matrix directly without reordering - ld_submatrix <- current_ld_matrix - } else { - # If order is different, reorder using matched indices - matched_indices <- match(common_variants, rownames(current_ld_matrix)) - ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] - rownames(ld_submatrix) <- common_variants - colnames(ld_submatrix) <- common_variants + if (sum(current_ld_matrix) == 1){ + ld_submatrix <- current_ld_matrix + } else { + ld_submatrix <- NULL + if (length(common_variants) > 0) { + # Only include the submatrix if this entry is unique or is the first occurrence + if (i == unified_dict[i]) { + # Check if common_variants and rownames have identical order + if (identical(common_variants, rownames(current_ld_matrix))) { + # If order is identical, use the matrix directly without reordering + ld_submatrix <- current_ld_matrix + } else { + # If order is different, reorder using matched indices + matched_indices <- match(common_variants, rownames(current_ld_matrix)) + ld_submatrix <- current_ld_matrix[matched_indices, matched_indices, drop = FALSE] + rownames(ld_submatrix) <- common_variants + colnames(ld_submatrix) <- common_variants + } } } } diff --git a/R/colocboost_update.R b/R/colocboost_update.R index 21b1bb5..a75296d 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -111,16 +111,29 @@ colocboost_update <- function(cb_model, cb_model_para, cb_data) { if (length(cb_data$data[[i]]$variable_miss) != 0) { beta <- cb_model[[i]]$beta[-cb_data$data[[i]]$variable_miss] / beta_scaling xty <- cb_data$data[[i]]$XtY[-cb_data$data[[i]]$variable_miss] - cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta + if (sum(xtx) == 
1){ + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * beta + } else { + cb_model[[i]]$res[-cb_data$data[[i]]$variable_miss] <- xty - scaling_factor * xtx %*% beta + } + } else { beta <- cb_model[[i]]$beta / beta_scaling xty <- cb_data$data[[i]]$XtY - cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta + if (sum(xtx) == 1){ + cb_model[[i]]$res <- xty - scaling_factor * beta + } else { + cb_model[[i]]$res <- xty - scaling_factor * xtx %*% beta + } } # - profile-loglikelihood yty <- cb_data$data[[i]]$YtY / scaling_factor xty <- xty / scaling_factor - profile_log <- (yty - 2 * sum(beta * xty) + sum((xtx %*% as.matrix(beta)) * beta)) * adj_dep + if (sum(xtx) == 1){ + profile_log <- (yty - 2 * sum(beta * xty) + sum(beta^2)) * adj_dep + } else { + profile_log <- (yty - 2 * sum(beta * xty) + sum((xtx %*% as.matrix(beta)) * beta)) * adj_dep + } } cb_model[[i]]$profile_loglike_each <- c(cb_model[[i]]$profile_loglike_each, profile_log) } @@ -137,10 +150,13 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P }) corr[which(is.na(corr))] <- 0 } else if (!is.null(XtX)) { - # scaling <- if (!is.null(N)) N-1 else 1 jk1.remain <- which(remain_idx == jk1) corr <- rep(0, P) - corr[remain_idx] <- XtX[, jk1.remain] + if (sum(XtX) == 1){ + corr[remain_idx] <- 1 + } else { + corr[remain_idx] <- XtX[, jk1.remain] + } } corr } From 50ceb00e52943561c007f3395f46d60ec50962e9 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 15 May 2025 19:04:24 -0500 Subject: [PATCH 04/13] fix error --- R/colocboost_assemble.R | 6 ++++-- R/colocboost_inference.R | 31 +++++++++++++++++++------------ 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 0a895bc..74f9efa 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -72,10 +72,12 @@ colocboost_assemble <- function(cb_obj, } } else { if 
(cb_obj$cb_model_para$model_used == "LD_free") { + # fixme later check_null_method <- "obj" - check_null_max <- check_null + check_null_max <- check_null * check_null } else if (cb_obj$cb_model_para$model_used == "one_causal"){ - check_null_max <- check_null_max * 0.1 + # fixme later + check_null_max <- check_null_max * check_null } cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_method = check_null_method) # --------- about colocalized confidence sets --------------------------------- diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index 5fd2174..ad5531b 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -357,12 +357,11 @@ check_null_post <- function(cb_obj, YtY = cb_data$data[[X_dict]]$YtY, miss_idx = cb_data$data[[j]]$variable_miss ) + last_obj <- min(cb_obj$cb_model[[j]]$obj_path) + change <- abs(cs_obj - last_obj) if (length(cb_obj$cb_model[[j]]$obj_path) == 1) { total_obj <- 1 - change <- cs_obj } else { - last_obj <- min(cb_obj$cb_model[[j]]$obj_path) - change <- abs(cs_obj - last_obj) total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) } check_cs_change[i, j] <- change / total_obj @@ -420,8 +419,12 @@ get_purity <- function(pos, X = NULL, Xcorr = NULL, N = NULL, n = 100) { corr[which(is.na(corr))] <- 0 value <- abs(get_upper_tri(corr)) } else { - Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr - value <- abs(get_upper_tri(Xcorr[pos, pos])) + if (sum(Xcorr) == 1){ + value <- 0 + } else { + Xcorr <- Xcorr # if (!is.null(N)) Xcorr/(N-1) else Xcorr + value <- abs(get_upper_tri(Xcorr[pos, pos])) + } } return(c( min(value), @@ -455,14 +458,18 @@ get_between_purity <- function(pos1, pos2, X = NULL, Xcorr = NULL, miss_idx = NU X_sub2 <- scale(X[, pos2, drop = FALSE], center = T, scale = F) value <- abs(get_matrix_mult(X_sub1, X_sub2)) } else { - if (length(miss_idx)!=0){ - pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) - pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) - } - 
if (length(pos1) != 0 & length(pos2) != 0) { - value <- abs(Xcorr[pos1, pos2]) - } else { + if (sum(Xcorr)==1){ value <- 0 + } else { + if (length(miss_idx)!=0){ + pos1 <- na.omit(match(pos1, setdiff(1:P, miss_idx))) + pos2 <- na.omit(match(pos2, setdiff(1:P, miss_idx))) + } + if (length(pos1) != 0 & length(pos2) != 0) { + value <- abs(Xcorr[pos1, pos2]) + } else { + value <- 0 + } } } return(c(min(value), max(value), get_median(value))) From ca793cfa2851394a86897d85822df643fb659c3c Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Mon, 19 May 2025 20:14:29 -0400 Subject: [PATCH 05/13] minor fix --- R/colocboost.R | 1 + R/colocboost_inference.R | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/R/colocboost.R b/R/colocboost.R index 0227d36..51a6774 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -642,3 +642,4 @@ colocboost <- function(X = NULL, Y = NULL, # individual data class(cb_output) <- "colocboost" return(cb_output) } + diff --git a/R/colocboost_inference.R b/R/colocboost_inference.R index ad5531b..5226db0 100644 --- a/R/colocboost_inference.R +++ b/R/colocboost_inference.R @@ -207,6 +207,7 @@ w_purity <- function(weights, X = NULL, Xcorr = NULL, N = NULL, n = 100, coverag return(is_pure) } + #' Function to remove the spurious signals #' @importFrom utils head tail #' @keywords cb_post_inference @@ -360,7 +361,7 @@ check_null_post <- function(cb_obj, last_obj <- min(cb_obj$cb_model[[j]]$obj_path) change <- abs(cs_obj - last_obj) if (length(cb_obj$cb_model[[j]]$obj_path) == 1) { - total_obj <- 1 + total_obj <- change } else { total_obj <- diff(range(cb_obj$cb_model[[j]]$obj_path)) } From 429ed65e90c4e8b482ad9d6663d6647a0cb5f5c5 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Sat, 24 May 2025 21:04:55 -0400 Subject: [PATCH 06/13] Update colocboost_output.R fix error for get_cos_summary --- R/colocboost_output.R | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/R/colocboost_output.R b/R/colocboost_output.R index d22b6e0..87617e8 100644 --- a/R/colocboost_output.R +++ b/R/colocboost_output.R @@ -838,7 +838,7 @@ get_ucos_summary <- function(cb_output, outcome_names = NULL, region_name = NULL "overlap_idx", "overlap_variables", "n_recalibrated_variables", "recalibrated_index", "recalibrated_variables", "recalibrated_variables_vcp" ) - + ambiguous_summary <- as.data.frame(ambiguous_summary) ambiguous_summary[, 1] <- unlist(sapply(ambiguous_results, function(tmp) paste0(tmp$ambiguous_cos_outcomes$outcome_name, collapse = "; "))) ambiguous_summary[, 2] <- names(ambiguous_results) ambiguous_summary[, 3] <- sapply(ambiguous_results, function(tmp) max(tmp$ambigous_cos_purity$min_abs_cor[lower.tri(tmp$ambigous_cos_purity$min_abs_cor)]) ) From 8c60381cf97ca136d7969625481e7ce9973de593 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Tue, 8 Jul 2025 13:05:06 -0400 Subject: [PATCH 07/13] Update colocboost_init.R slient the warning message from qvalue function, and if warning happens, try another qvalue estimation of pi0 --- R/colocboost_init.R | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index 07de832..e08929e 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -430,15 +430,35 @@ get_lfdr <- function(z, miss_idx = NULL) { z <- z[-miss_idx] try_run <- 1 while (try_run == 1 && lambda_max >= 0.05) { - result <- try( + # result <- try( + # { + # lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), lambda = seq(0.05, lambda_max, 0.05))$lfdr + # }, + # silent = TRUE + # ) + # if (inherits(result, "try-error")) { + # lambda_max <- lambda_max - 0.05 # Decrement lambda_max if error occurs + # } else { + # try_run <- 0 + # } + result <- tryCatch( { - lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), lambda = seq(0.05, lambda_max, 0.05))$lfdr + 
lfdr_nomissing <- qvalue(pchisq(drop(z^2), 1, lower.tail = FALSE), + lambda = seq(0.05, lambda_max, 0.05))$lfdr + list(success = TRUE, value = lfdr_nomissing) }, - silent = TRUE + warning = function(w) { + list(success = FALSE, message = w$message) + }, + error = function(e) { + list(success = FALSE, message = e$message) + } ) - if (inherits(result, "try-error")) { - lambda_max <- lambda_max - 0.05 # Decrement lambda_max if error occurs + + if (!result$success) { + lambda_max <- lambda_max - 0.05 } else { + lfdr_nomissing <- result$value try_run <- 0 } } From f7b0a25fa2bd0ed18f6f88d2720387a1669c1845 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Fri, 11 Jul 2025 23:21:35 -0400 Subject: [PATCH 08/13] issue about focal outcome index The issue: if we have multiple sumstat and only one superset LD matrix, the index of focal outcome will be mismatched. 1. dict is designed by X and LD, but we need additional dictionary for keep variables. 2. keep variables for individual level data is based on X - there is no issue. 3. keep variables for sumstat data is based on sumstat (not LD) - there is a issue to index the focal outcome based on dict later. Keeping and commenting out the original code for initial testing, later we will remove the commented code if pass the test Additionally: 1. adding unit test for the above issue. 2. debuging the data loader error. 
--- R/colocboost_init.R | 25 ++++- tests/testthat/test_corner_cases.R | 2 +- tests/testthat/test_inference.R | 4 +- tests/testthat/test_model.R | 166 +++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+), 5 deletions(-) diff --git a/R/colocboost_init.R b/R/colocboost_init.R index e08929e..bd278ab 100644 --- a/R/colocboost_init.R +++ b/R/colocboost_init.R @@ -39,22 +39,43 @@ colocboost_init_data <- function(X, Y, dict_YX, ################# initialization ####################################### cb_data <- list("data" = list()) class(cb_data) <- "colocboost" + # if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { + # dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) + # n_ind_variable <- max(dict_YX) + # } else if (!is.null(dict_YX) & is.null(dict_sumstatLD)) { + # dict <- dict_YX + # } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)) { + # dict <- dict_sumstatLD + # n_ind_variable <- 0 + # } + + # note here and remove later: dict is designed by X and LD, but we need additional dictionary for keep variables. + # keep variables for individual level data is based on X - there is no issue. + # keep variables for sumstat data is based on sumstat (not LD) - there is a issue to index the focal outcome based on dict later. if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) n_ind_variable <- max(dict_YX) + dict_keep_variables <- c(dict_YX, 1:length(dict_sumstatLD) + n_ind_variable) } else if (!is.null(dict_YX) & is.null(dict_sumstatLD)) { dict <- dict_YX + dict_keep_variables <- dict_YX } else if (is.null(dict_YX) & !is.null(dict_sumstatLD)) { dict <- dict_sumstatLD n_ind_variable <- 0 + dict_keep_variables <- 1:length(dict_sumstatLD) } + if (focal_outcome_variables & !is.null(focal_outcome_idx)) { if (focal_outcome_idx > length(dict)) { stop("Target outcome index is over the total number of outcomes! 
please check!") } - keep_variable_names <- keep_variables[[dict[focal_outcome_idx]]] + # keep_variable_names <- keep_variables[[dict[focal_outcome_idx]]] + keep_variable_names <- keep_variables[[dict_keep_variables[focal_outcome_idx]]] if (overlap_variables) { - keep_tmp <- lapply(keep_variables[-dict[focal_outcome_idx]], function(tm) { + # keep_tmp <- lapply(keep_variables[-dict[focal_outcome_idx]], function(tm) { + # intersect(keep_variable_names, tm) + # }) + keep_tmp <- lapply(keep_variables[-dict_keep_variables[focal_outcome_idx]], function(tm) { intersect(keep_variable_names, tm) }) keep_variable_names <- Reduce(union, keep_tmp) diff --git a/tests/testthat/test_corner_cases.R b/tests/testthat/test_corner_cases.R index d499159..66938a5 100644 --- a/tests/testthat/test_corner_cases.R +++ b/tests/testthat/test_corner_cases.R @@ -340,7 +340,7 @@ test_that("colocboost prioritizes focal outcome correctly", { # Test with ambiguous corner cases test_that("get_ambiguous_colocalization handles edge cases with correlation thresholds", { - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- Ambiguous_Colocalization$ColocBoost_Results # Test with very high correlation thresholds (should find fewer ambiguities) diff --git a/tests/testthat/test_inference.R b/tests/testthat/test_inference.R index 64e74f3..0c1acd7 100644 --- a/tests/testthat/test_inference.R +++ b/tests/testthat/test_inference.R @@ -191,7 +191,7 @@ test_that("get_hierarchical_clusters functions correctly", { test_that("get_ambiguous_colocalization identifies ambiguous colocalizations correctly", { # The function expects a specialized test dataset that has ambiguous colocalizations # There's a reference in the example to a dataset named "Ambiguous_Colocalization" - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- 
Ambiguous_Colocalization$ColocBoost_Results # Basic call with default parameters @@ -256,7 +256,7 @@ test_that("get_ambiguous_colocalization identifies ambiguous colocalizations cor test_that("get_ucos_summary funtionality", { # The function expects a specialized test dataset that has ambiguous colocalizations # There's a reference in the example to a dataset named "Ambiguous_Colocalization" - data(Ambiguous_Colocalization) + data("Ambiguous_Colocalization", package = "colocboost", envir = environment()) test_colocboost_results <- Ambiguous_Colocalization$ColocBoost_Results # Basic call with default parameters diff --git a/tests/testthat/test_model.R b/tests/testthat/test_model.R index 8a8848f..df998e8 100644 --- a/tests/testthat/test_model.R +++ b/tests/testthat/test_model.R @@ -89,6 +89,172 @@ test_that("colocboost_init_data correctly initializes data", { } }) +# Test for dict_keep_variables mapping issue +test_that("colocboost correctly maps focal outcome to keep_variables with dict_keep_variables", { + + # Test Case 1: Mixed individual and summary statistics data + # 5 Y outcomes with 1 superset X + 5 sumstats with 1 superset LD + set.seed(42) + n <- 100 + p_x <- 300 # X matrix has 300 variables + p_sumstat <- 500 # 5th sumstat has 500 variables, others have 300 + + # Generate superset X matrix + X_superset <- matrix(rnorm(n * p_x), n, p_x) + colnames(X_superset) <- paste0("SNP_X_", 1:p_x) + + # Generate Y outcomes (all use the same superset X) + Y_list <- list() + for (i in 1:5) { + Y_list[[i]] <- matrix(rnorm(n), n, 1) + } + + # Generate summary statistics + # First 4 sumstats have 300 variables, 5th has 500 variables + sumstat_list <- list() + for (i in 1:4) { + sumstat_list[[i]] <- data.frame( + beta = rnorm(p_x), + sebeta = abs(rnorm(p_x, 0, 0.1)), + n = n, + variant = paste0("SNP_S_", 1:p_x) + ) + } + + # 5th sumstat has 500 variables + sumstat_list[[5]] <- data.frame( + beta = rnorm(p_sumstat), + sebeta = abs(rnorm(p_sumstat, 0, 0.1)), + n = n, + variant 
= paste0("SNP_S_", 1:p_sumstat) + ) + + # Create superset LD matrix for summary statistics + LD_superset <- diag(p_sumstat) # Use diagonal for simplicity + colnames(LD_superset) <- rownames(LD_superset) <- paste0("SNP_S_", 1:p_sumstat) + + # Set up dictionaries + dict_YX <- c(1, 1, 1, 1, 1) # All 5 Y outcomes use the same X (index 1) + dict_sumstatLD <- c(1, 1, 1, 1, 1) # All 5 sumstats use the same LD (index 1) + + # Expected keep_variables structure: + # Index 1: Variables from X_superset (300 variables) + # Index 2: Variables from sumstat 1 (300 variables) + # Index 3: Variables from sumstat 2 (300 variables) + # Index 4: Variables from sumstat 3 (300 variables) + # Index 5: Variables from sumstat 4 (300 variables) + # Index 6: Variables from sumstat 5 (500 variables) + keep_variables <- list( + colnames(X_superset), # Index 1: 300 variables + sumstat_list[[1]]$variant, # Index 2: 300 variables + sumstat_list[[2]]$variant, # Index 3: 300 variables + sumstat_list[[3]]$variant, # Index 4: 300 variables + sumstat_list[[4]]$variant, # Index 5: 300 variables + sumstat_list[[5]]$variant # Index 6: 500 variables + ) + + # Test the dictionary creation logic + if (!is.null(dict_YX) & !is.null(dict_sumstatLD)) { + dict <- c(dict_YX, max(dict_YX) + dict_sumstatLD) + n_ind_variable <- max(dict_YX) + # Create dict_keep_variables: first part maps to X matrices, second part maps to sumstat indices + dict_keep_variables <- c(dict_YX, 1:length(dict_sumstatLD) + n_ind_variable) + } + + # Expected results: + # dict = c(1,1,1,1,1,2,2,2,2,2) (maps to data matrices) + # dict_keep_variables = c(1,1,1,1,1,2,3,4,5,6) (maps to keep_variables indices) + + expect_equal(dict, c(1,1,1,1,1,2,2,2,2,2)) + expect_equal(dict_keep_variables, c(1,1,1,1,1,2,3,4,5,6)) + + # Test focal outcome mapping + focal_outcome_idx <- 10 # 5th sumstat (the one with 500 variables) + focal_outcome_variables <- TRUE + overlap_variables <- FALSE + + if (focal_outcome_variables & !is.null(focal_outcome_idx)) { + if 
(focal_outcome_idx > length(dict)) { + stop("Target outcome index is over the total number of outcomes! please check!") + } + + # Use dict_keep_variables to get the correct keep_variables index + keep_variable_names <- keep_variables[[dict_keep_variables[focal_outcome_idx]]] + + # The focal outcome (index 10) should map to keep_variables index 6 (500 variables) + expect_equal(dict_keep_variables[focal_outcome_idx], 6) + expect_equal(length(keep_variable_names), 500) + expect_equal(keep_variable_names, sumstat_list[[5]]$variant) + } + + # Test Case 2: Only summary statistics + dict_YX_null <- NULL + dict_sumstatLD_only <- c(1,1,1,1,1) + + if (is.null(dict_YX_null) & !is.null(dict_sumstatLD_only)) { + dict_only_sumstat <- dict_sumstatLD_only + n_ind_variable_only <- 0 + # Only summary statistics, so dict_keep_variables maps directly to sumstat indices + dict_keep_variables_only <- 1:length(dict_sumstatLD_only) + } + + expect_equal(dict_keep_variables_only, c(1,2,3,4,5)) + + # Test focal outcome mapping for sumstat-only case + focal_outcome_idx_only <- 5 # 5th sumstat + keep_variable_names_only <- keep_variables[[dict_keep_variables_only[focal_outcome_idx_only] + 1]] # +1 because no X matrix + + expect_equal(dict_keep_variables_only[focal_outcome_idx_only], 5) + expect_equal(length(keep_variable_names_only), 500) + + # Test Case 3: Only individual level data + dict_YX_only <- c(1,1,1,1,1) + dict_sumstatLD_null <- NULL + + if (!is.null(dict_YX_only) & is.null(dict_sumstatLD_null)) { + dict_only_ind <- dict_YX_only + # Only individual level data, so dict_keep_variables is the same as dict_YX + dict_keep_variables_only_ind <- dict_YX_only + } + + expect_equal(dict_keep_variables_only_ind, c(1,1,1,1,1)) + + # All outcomes should map to the same X matrix (index 1) + focal_outcome_idx_ind <- 3 + expect_equal(dict_keep_variables_only_ind[focal_outcome_idx_ind], 1) + + # Test Case 4: Test with actual colocboost call (if the function accepts the corrected logic) + # This would 
be a integration test to ensure the fix works end-to-end + + # Create a minimal working example + # Skip if colocboost doesn't have the fixed dict_keep_variables logic yet + if (exists("colocboost_init_data")) { + expect_error({ + # This should work with the corrected dict_keep_variables mapping + cb_data_test <- colocboost_init_data( + X = list(X_superset), + Y = Y_list, + dict_YX = dict_YX, + Z = lapply(sumstat_list, function(s) s$beta / s$sebeta), # z-scores + LD = list(LD_superset), + N_sumstat = lapply(sumstat_list, function(s) s$n[1]), + dict_sumstatLD = dict_sumstatLD, + Var_y = NULL, + SeBhat = NULL, + keep_variables = keep_variables, + focal_outcome_idx = 10, # Should correctly map to 500 variables + focal_outcome_variables = TRUE, + overlap_variables = FALSE + ) + + # The resulting variable names should be from the 5th sumstat (500 variables) + expect_equal(length(cb_data_test$variable.names), 500) + }, NA) + } else { + skip("colocboost_init_data not directly accessible for integration test") + } +}) + # Test colocboost_assemble function test_that("colocboost_assemble processes model results", { From a6e083b5bc49b5323558b526f22472168e7fcbd6 Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Thu, 31 Jul 2025 07:29:28 -0500 Subject: [PATCH 09/13] Update colocboost_update.R Handle error since the best joint update jk_star is not included in the specific dataset. We will not consider LD for this update. 
--- R/colocboost_update.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/colocboost_update.R b/R/colocboost_update.R index a75296d..5858a3e 100644 --- a/R/colocboost_update.R +++ b/R/colocboost_update.R @@ -152,7 +152,7 @@ get_LD_jk <- function(jk1, X = NULL, XtX = NULL, N = NULL, remain_idx = NULL, P } else if (!is.null(XtX)) { jk1.remain <- which(remain_idx == jk1) corr <- rep(0, P) - if (sum(XtX) == 1){ + if (sum(XtX) == 1 | length(jk1.remain)==0){ corr[remain_idx] <- 1 } else { corr[remain_idx] <- XtX[, jk1.remain] From c7efc1850ea735ff20793e24ab57f2ecf76dcf3c Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Mon, 4 Aug 2025 08:04:09 -0500 Subject: [PATCH 10/13] Adding uCoS filtering criteria Adding uCoS filtering criteria - check_null_max_ucos --- R/colocboost.R | 23 ++++------------------- R/colocboost_assemble.R | 5 ++++- R/colocboost_utils.R | 10 +++++++--- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/R/colocboost.R b/R/colocboost.R index 51a6774..81e8490 100644 --- a/R/colocboost.R +++ b/R/colocboost.R @@ -82,7 +82,8 @@ #' @param weight_fudge_factor The strength to integrate weight from different outcomes, default is 1.5 #' @param check_null The cut off value for change conditional objective function. Default is 0.1. #' @param check_null_method The metric to check the null sets. Default is "profile" -#' @param check_null_max The smallest value of change of profile loglikelihood for each outcome. +#' @param check_null_max The smallest value of change of profile loglikelihood for each outcome in CoS. +#' @param check_null_max_ucos The smallest value of change of profile loglikelihood for each outcome in uCoS. #' @param weaker_effect If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects #' @param LD_free When \code{LD_free = FALSE}, objective function doesn't include LD information.
#' @param output_level When \code{output_level = 1}, return basic cos details for colocalization results @@ -180,6 +181,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data check_null = 0.1, # the cut off value for change conditional objective function check_null_method = "profile", # the metric to check the null sets. check_null_max = 0.025, # the smallest value of change of profile loglikelihood for each outcome. + check_null_max_ucos = 0.015, # the smallest value of change of profile loglikelihood for each outcome in uCoS. weaker_effect = TRUE, LD_free = FALSE, output_level = 1) { @@ -401,24 +403,6 @@ colocboost <- function(X = NULL, Y = NULL, # individual data LD <- 1 sumstatLD_dict <- rep(1, length(sumstat)) - -# -# p.sumstat <- sapply(keep_variable_sumstat, length) -# p.unique <- unique(p.sumstat) -# if (length(p.unique) == 1) { -# ld <- diag(1, nrow = p.unique) -# colnames(ld) <- rownames(ld) <- keep_variable_sumstat[[1]] -# LD <- list(ld) -# sumstatLD_dict <- rep(1, length(sumstat)) -# } else { -# LD <- lapply(keep_variable_sumstat, function(sn) { -# ld <- diag(1, nrow = length(sn)) -# colnames(ld) <- rownames(ld) <- sn -# return(ld) -# }) -# sumstatLD_dict <- 1:length(sumstat) -# } - # change some algorithm parameters M <- 1 # one iteration min_abs_corr <- 0 # remove purity checking @@ -626,6 +610,7 @@ colocboost <- function(X = NULL, Y = NULL, # individual data check_null = check_null, check_null_method = check_null_method, check_null_max = check_null_max, + check_null_max_ucos = check_null_max_ucos, dedup = dedup, overlap = overlap, n_purity = n_purity, diff --git a/R/colocboost_assemble.R b/R/colocboost_assemble.R index 74f9efa..aba5405 100644 --- a/R/colocboost_assemble.R +++ b/R/colocboost_assemble.R @@ -25,6 +25,7 @@ colocboost_assemble <- function(cb_obj, check_null = 0.1, check_null_method = "profile", check_null_max = 0.025, + check_null_max_ucos = 0.015, dedup = TRUE, overlap = TRUE, n_purity = 100, @@ -79,7 +80,9 @@ 
colocboost_assemble <- function(cb_obj, # fixme later check_null_max <- check_null_max * check_null } - cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, check_null_method = check_null_method) + cb_obj <- get_max_profile(cb_obj, check_null_max = check_null_max, + check_null_max_ucos = check_null_max_ucos, + check_null_method = check_null_method) # --------- about colocalized confidence sets --------------------------------- out_cos <- colocboost_assemble_cos(cb_obj, coverage = coverage, diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index 702590e..c252bf6 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -407,14 +407,18 @@ get_avWeigth <- function(cb_model, coloc_outcomes, update, pos.coloc, name_weigh } -get_max_profile <- function(cb_obj, check_null_max = 0.02, check_null_method = "profile") { +get_max_profile <- function(cb_obj, check_null_max = 0.025, + check_null_max_ucos = 0.015, + check_null_method = "profile") { for (i in 1:cb_obj$cb_model_para$L) { cb <- cb_obj$cb_model[[i]] scaling_factor <- if (!is.null(cb_obj$cb_data$data[[i]]$N)) cb_obj$cb_data$data[[i]]$N - 1 else 1 if (check_null_method == "profile") { cb$check_null_max <- 1000 * check_null_max / scaling_factor + cb$check_null_max_ucos <- 1000 * check_null_max_ucos / scaling_factor } else { cb$check_null_max <- check_null_max + cb$check_null_max_ucos <- check_null_max_ucos } cb_obj$cb_model[[i]] <- cb } @@ -879,12 +883,12 @@ get_full_output <- function(cb_obj, past_out = NULL, variables = NULL, cb_output cs_change <- data.frame("ucos_outcome" = change_outcomes, "ucos_delta" = change_values) # - filter weak ucos - check_null_max <- sapply(cb_model, function(cb) cb$check_null_max) + check_null_max_ucos <- sapply(cb_model, function(cb) cb$check_null_max_ucos) remove_weak <- sapply(1:nrow(cs_change), function(ic) { outcome_tmp <- cs_change$ucos_outcome[ic] delta_tmp <- cs_change$ucos_delta[ic] pp <- which(cb_obj$cb_model_para$outcome_names == outcome_tmp) - 
check_tmp <- check_null_max[pp] + check_tmp <- check_null_max_ucos[pp] delta_tmp >= check_tmp }) keep_ucos <- which(remove_weak) From 2b236793360d9eea4cc39efe840697ea0f78029d Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Fri, 22 Aug 2025 13:18:35 -0400 Subject: [PATCH 11/13] Update colocboost_plot.R adding horizontal and vertical spaces of text labels. --- R/colocboost_plot.R | 60 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 12 deletions(-) diff --git a/R/colocboost_plot.R b/R/colocboost_plot.R index 9381837..90141d6 100644 --- a/R/colocboost_plot.R +++ b/R/colocboost_plot.R @@ -29,7 +29,7 @@ #' @param ylim_each Logical, if TRUE uses separate y-axis limits for each plot, default is TRUE #' @param outcome_legend_pos Position for outcome legend, default is "top" #' @param outcome_legend_size Size for outcome legend text, default is 1.2 -#' @param cos_legend_pos Proportion of the legend from (left edge, bottom edge), default as (0.05, 0.4) at the left - median position +#' @param cos_legend_pos Proportion of the legend from (left edge, bottom edge) followed by the distance for multiple labels (horizontal-x space, vertical-y space), default as (0.05, 0.4, 0.1, 0.5) at the left - median position #' @param show_variable Logical, if TRUE displays variant IDs, default is FALSE #' @param lab_style Vector of two numbers for label style (size, boldness), default is c(2, 1) #' @param axis_style Vector of two numbers for axis style (size, boldness), default is c(2, 1) @@ -93,7 +93,7 @@ colocboost_plot <- function(cb_output, y = "log10p", ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, - cos_legend_pos = c(0.05, 0.4), + cos_legend_pos = c(0.05, 0.4, 0.1, 0.5), show_variable = FALSE, lab_style = c(2, 1), axis_style = c(2, 1), @@ -301,7 +301,7 @@ colocboost_plot <- function(cb_output, y = "log10p", y_pos <- usr[3] + cb_plot_init$cos_legend_pos[2] * (usr[4] - usr[3]) # 50%
from bottom edge legend(x = x_pos, y = y_pos, texts, bty = "n", col = shape_col, text.col = texts_col, - cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = 0.1, y.intersp = 0.5 + cex = 1.5, pt.cex = 1.5, pch = 4, x.intersp = cos_legend_pos[3], y.intersp = cos_legend_pos[4] ) } } @@ -638,19 +638,55 @@ plot_initial <- function(cb_plot_input, y = "log10p", # - set background point color and cos color pools args$bg <- points_color if (is.null(cos_color)) { - # cos_color <- c("dodgerblue2", "#6A3D9A", "#FF7F00", "#FB9A99", "#33A02C", - # "#A6CEE3", "gold1", "#01665E","#FDBF6F", "#CAB2D6", "#B2DF8A", - # "#8C510A", "#BF812D", "#DFC27D", "#F6E8C3", "#01665E", - # "#35978F", "#80CDC1", "#C7EAE5", "#003C30") - # cos_color <- c("#1B9E77", "#D95F02", "#7570B3", "#1F78B4", "#66A61E", - # "#E6AB02", "#A6761D", "#666666", "#E7298A", "#B2182B", - # "#D73027", "#4575B4", "#313695", "#542788", "#74ADD1", - # "#F46D43", "#4DAF4A", "#984EA3", "#FF7F00", "#A50F15") + + # cos_color <- c( + # "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43", + # "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF", + # "#4DAF4A", "#E41A1C", "#FF7F00", "#6A3D9A", "#1B7837", + # "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E" + # ) cos_color <- c( + # Original 20 colors (preserved exactly) "#377EB8", "#E69F00", "#33A02C", "#984EA3", "#F46D43", "#A65628", "#1F4E79", "#B2182B", "#D73027", "#F781BF", "#4DAF4A", "#E41A1C", "#FF7F00", "#6A3D9A", "#1B7837", - "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E" + "#E6AB02", "#542788", "#74ADD1", "#A50F15", "#01665E", + + # Additional 80 colors - carefully selected to complement the original palette + # Blues and Teals + "#5DADE2", "#85C1E9", "#2E86C1", "#1B4F72", "#154360", + "#AED6F1", "#3498DB", "#21618C", "#1A5490", "#17A2B8", + "#20B2AA", "#008B8B", "#4682B4", "#87CEEB", "#B0E0E6", + + # Greens + "#58D68D", "#82E0AA", "#27AE60", "#1E8449", "#186A3B", + "#ABEBC6", "#2ECC71", "#229954", "#196F3D", "#52C41A", + "#90EE90", "#32CD32", "#228B22", 
"#006400", "#9ACD32", + + # Reds and Pinks + "#EC7063", "#F1948A", "#CB4335", "#A93226", "#922B21", + "#FADBD8", "#E74C3C", "#C0392B", "#943126", "#FF6B6B", + "#FF69B4", "#FFB6C1", "#FF1493", "#DC143C", "#B22222", + + # Oranges and Yellows + "#F8C471", "#FAD7A0", "#E67E22", "#CA6F1E", "#B7950B", + "#FCF3CF", "#F39C12", "#D68910", "#B7950B", "#FFA500", + "#FFFF00", "#FFD700", "#FFA500", "#FF8C00", "#DAA520", + + # Purples and Violets + "#BB8FCE", "#D2B4DE", "#8E44AD", "#7D3C98", "#6C3483", + "#E8DAEF", "#9B59B6", "#7E57C2", "#673AB7", "#9C27B0", + "#8A2BE2", "#9370DB", "#BA55D3", "#DA70D6", "#DDA0DD", + + # Browns and Earth Tones + "#CD853F", "#D2691E", "#A0522D", "#8B4513", "#654321", + "#DEB887", "#F4A460", "#BC8F8F", "#D2B48C", "#F5DEB3", + "#8B7355", "#A0522D", "#CD853F", "#DEB887", "#D2691E", + + # Grays and Neutrals + "#85929E", "#AEB6BF", "#5D6D7E", "#34495E", "#2C3E50", + "#D5D8DC", "#BDC3C7", "#95A5A6", "#7F8C8D", "#17202A", + "#808080", "#A9A9A9", "#C0C0C0", "#696969", "#778899" ) From fa39de99d46fe8349416a88acc3a871810213c9f Mon Sep 17 00:00:00 2001 From: xuewei cao <36172337+xueweic@users.noreply.github.com> Date: Fri, 22 Aug 2025 22:16:24 -0400 Subject: [PATCH 12/13] Update colocboost_utils.R fix error when variable names in different datasets have different orders (e.g., dataset1 has A then B, but dataset2 has B then A). 
--- R/colocboost_utils.R | 50 +++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/R/colocboost_utils.R b/R/colocboost_utils.R index c252bf6..c4da011 100644 --- a/R/colocboost_utils.R +++ b/R/colocboost_utils.R @@ -438,26 +438,49 @@ w_cs <- function(weights, coverage = 0.95) { #' Pure R implementation (fallback) #' @noRd get_merge_ordered_with_indices <- function(vector_list) { - # Quick validation - if (!is.list(vector_list) || length(vector_list) == 0) { - stop("Input must be a non-empty list of vectors") + + # Fallback function for priority-based merging (embedded) + get_merge_ordered_with_priority_fallback <- function(vector_list) { + # Convert all vectors to character + vector_list <- lapply(vector_list, as.character) + + # Start with empty result + result <- character() + processed_elements <- character() + + # Process vectors in priority order (1st, 2nd, 3rd, etc.) + for (i in seq_along(vector_list)) { + current_vec <- vector_list[[i]] + + # Add elements from current vector that haven't been processed yet + new_elements <- setdiff(current_vec, processed_elements) + + if (length(new_elements) > 0) { + # Find positions of new elements in current vector to maintain relative order + new_positions <- which(current_vec %in% new_elements) + ordered_new_elements <- current_vec[new_positions] + + # Add to result + result <- c(result, ordered_new_elements) + processed_elements <- c(processed_elements, ordered_new_elements) + } + } + + return(result) } - + # Convert all vectors to character vector_list <- lapply(vector_list, as.character) n_vectors <- length(vector_list) - # Step 1: Get all unique elements all_elements <- unique(unlist(vector_list)) n_elements <- length(all_elements) - # Step 2: Build a graph of ordering constraints # Use an adjacency list: for each element, store elements that must come after it graph <- new.env(hash = TRUE, parent = emptyenv(), size = n_elements) for (elem in all_elements) { 
graph[[elem]] <- character() } - # Add edges based on consecutive pairs in each vector for (vec in vector_list) { for (i in seq_len(length(vec) - 1)) { @@ -469,7 +492,6 @@ get_merge_ordered_with_indices <- function(vector_list) { } } } - # Step 3: Compute in-degrees (number of incoming edges for each node) in_degree <- new.env(hash = TRUE, parent = emptyenv(), size = n_elements) for (elem in all_elements) { @@ -480,7 +502,6 @@ get_merge_ordered_with_indices <- function(vector_list) { in_degree[[to_elem]] <- in_degree[[to_elem]] + 1 } } - # Step 4: Topological sort using Kahn's algorithm # Start with nodes that have no incoming edges queue <- all_elements[sapply(all_elements, function(elem) in_degree[[elem]] == 0)] @@ -490,7 +511,6 @@ get_merge_ordered_with_indices <- function(vector_list) { current <- queue[1] queue <- queue[-1] result <- c(result, current) - # Process all neighbors (elements that must come after current) neighbors <- graph[[current]] for (next_elem in neighbors) { @@ -500,12 +520,14 @@ get_merge_ordered_with_indices <- function(vector_list) { } } } - - # Step 5: Check for cycles (if result doesn't include all elements, there’s a cycle) + # Step 5: Check for cycles and use fallback if needed if (length(result) != n_elements) { - stop("Cycle detected in ordering constraints; cannot produce a valid merged order") + # Different variable orders detected - use priority-based fallback + warning("Variable names in different datasets have different orders (e.g., dataset1 has A then B, but dataset2 has B then A). 
", + "Using priority-based fallback to prioritize the variable order in the earlier datasets.") + result <- get_merge_ordered_with_priority_fallback(vector_list) } - + result } From aeb4d1cc0d5b3d874476257be7dfa5c813120957 Mon Sep 17 00:00:00 2001 From: xueweic Date: Mon, 25 Aug 2025 16:38:13 +0000 Subject: [PATCH 13/13] Update documentation --- man/colocboost.Rd | 5 ++++- man/colocboost_plot.Rd | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/man/colocboost.Rd b/man/colocboost.Rd index 41c1488..cd9fcca 100644 --- a/man/colocboost.Rd +++ b/man/colocboost.Rd @@ -59,6 +59,7 @@ colocboost( check_null = 0.1, check_null_method = "profile", check_null_max = 0.025, + check_null_max_ucos = 0.015, weaker_effect = TRUE, LD_free = FALSE, output_level = 1 @@ -182,7 +183,9 @@ to merge colocalized sets, which may resulting in a huge set.} \item{check_null_method}{The metric to check the null sets. Default is "profile"} -\item{check_null_max}{The smallest value of change of profile loglikelihood for each outcome.} +\item{check_null_max}{The smallest value of change of profile loglikelihood for each outcome in CoS.} + +\item{check_null_max_ucos}{The smallest value of change of profile loglikelihood for each outcome in uCoS.} \item{weaker_effect}{If \code{weaker_effect = TRUE}, consider the weaker single effect due to coupling effects} diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd index 0fdd21c..fe155fc 100644 --- a/man/colocboost_plot.Rd +++ b/man/colocboost_plot.Rd @@ -33,7 +33,7 @@ colocboost_plot( ylim_each = TRUE, outcome_legend_pos = "top", outcome_legend_size = 1.8, - cos_legend_pos = c(0.05, 0.4), + cos_legend_pos = c(0.05, 0.4, 0.1, 0.5), show_variable = FALSE, lab_style = c(2, 1), axis_style = c(2, 1), @@ -92,7 +92,7 @@ colocboost_plot( \item{outcome_legend_size}{Size for outcome legend text, default is 1.2} -\item{cos_legend_pos}{Proportion of the legend from (left edge, bottom edge), default as (0.05, 0.4) at the left - median 
#' position} +\item{cos_legend_pos}{Proportion of the legend from (left edge, bottom edge) followed by the distance for multiple labels (horizontal-x space, vertical-y space), default as (0.05, 0.4, 0.1, 0.5) at the left - median position} \item{show_variable}{Logical, if TRUE displays variant IDs, default is FALSE}