From 85290cc4cc850a3bb04808177b7e50fb31de2703 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 17 Apr 2025 08:14:10 -0400
Subject: [PATCH 1/3] cran

---
 README.md                  | 3 ++-
 vignettes/installation.Rmd | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e36bbf4..700796a 100644
--- a/README.md
+++ b/README.md
@@ -8,9 +8,10 @@ This R package implements ColocBoost---motivated and designed for colocalization
 
 ## Quick Start
 
-### CRAN Installation
+### CRAN Installation (available soon)
 Install released versions from CRAN (Linux, macOS and Windows)
 
+
 ```r
 install.packages("colocboost")
 ```
diff --git a/vignettes/installation.Rmd b/vignettes/installation.Rmd
index 81ba77c..d83085b 100644
--- a/vignettes/installation.Rmd
+++ b/vignettes/installation.Rmd
@@ -16,7 +16,7 @@ knitr::opts_chunk$set(
 
 ## Installation
 
-### CRAN
+### CRAN (available soon)
 Install released versions from CRAN (Linux, macOS and Windows)
 
 ```r

From ac2ae47d6c92d28036c1738d11d10826c1e67905 Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 17 Apr 2025 13:21:45 -0400
Subject: [PATCH 2/3] add one feature for superset X

---
 DESCRIPTION                                   |   3 +-
 R/colocboost.R                                |  24 +--
 R/colocboost_init.R                           | 181 ++++++++++++------
 man/colocboost.Rd                             |  14 +-
 man/colocboost_plot.Rd                        |  15 +-
 man/get_cormat.Rd                             |   4 +-
 man/get_cos.Rd                                |  14 +-
 man/get_cos_summary.Rd                        |  14 +-
 man/get_strong_colocalization.Rd              |  14 +-
 vignettes/Individual_Level_Colocalization.Rmd |  41 +++-
 vignettes/Input_Data_Format.Rmd               |  49 ++++-
 .../Summary_Statistics_Colocalization.Rmd     |   4 +-
 12 files changed, 250 insertions(+), 127 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 72ab83e..b5787c3 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -28,9 +28,10 @@ Suggests:
     testthat (>= 3.0.0),
     knitr,
     rmarkdown,
+    bookdown,
     ashr,
     MASS
-VignetteBuilder: knitr
+VignetteBuilder: knitr, bookdown
 Roxygen: list(markdown = TRUE)
 Config/testthat/edition: 3
 License: MIT + file LICENSE
diff --git a/R/colocboost.R b/R/colocboost.R
index e9a746b..e88ba5d 100644
--- a/R/colocboost.R
+++ b/R/colocboost.R
@@ -191,15 +191,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   # - check individual level data
   if (!is.null(X) & !is.null(Y)) {
     # --- check input
-    if (is.data.frame(X)) {
-      X <- as.matrix(X)
-    }
-    if (is.data.frame(Y)) {
-      Y <- as.matrix(Y)
-    }
-    if (is.matrix(X)) {
-      X <- list(X)
-    }
+    if (is.data.frame(X))  X <- as.matrix(X)
+    if (is.data.frame(Y)) Y <- as.matrix(Y)
+    if (is.matrix(X))  X <- list(X)
     if (is.atomic(Y) && !is.list(Y)) {
       Y <- as.matrix(Y)
       if (ncol(Y) == 1) {
@@ -216,23 +210,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
       }
     } else {
       Y <- lapply(1:length(Y), function(ii) {
-        if (is.null(dict_YX)) {
-          idx <- ii
-        } else {
-          idx <- dict_YX[ii, 2]
-        }
-        n <- nrow(X[[idx]])
         y <- Y[[ii]]
         y <- as.matrix(y)
+        n <- length(y)
         if (nrow(y) == n) {
           return(y)
         } else if (ncol(y) == n) {
           return(t(y))
-        } else {
-          stop("X and Y do not have the same sample size!")
-        }
+        } 
       })
     }
+    
     # --- check if variables in individual data
     p.ind <- unique(sapply(X, ncol))
     if (length(p.ind) != 1) {
diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index 0ee9887..6368dfd 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -68,69 +68,18 @@ colocboost_init_data <- function(X, Y, dict_YX,
   flag <- 1
   # if individual: X, Y
   if (!is.null(X) & !is.null(Y)) {
-    drop_lowfreq <- c()
-    dict_YX_final <- dict_YX
-    for (ij in 1:length(X)) {
-      index <- which(dict_YX == ij)
-      dict_YX_final[index] <- index[1]
-    }
+    
+    ind_formated <- process_individual_data(
+      X, Y, dict_YX, target_variants = keep_variable_names,
+      intercept = intercept, standardize = standardize
+    )
     for (i in 1:length(Y)) {
-      tmp <- list(
-        "X" = NULL,
-        "Y" = scale(Y[[i]], center = intercept, scale = standardize),
-        "N" = length(Y[[i]]),
-        "variable_miss" = NULL
-      )
-      x_tmp <- X[[dict_YX[i]]]
-      change_x <- if (dict_YX_final[i] == i) TRUE else FALSE
-      # - if sample different
-      if (nrow(x_tmp) != length(Y[[i]])) {
-        change_x <- TRUE
-        ind_id_Y <- rownames(Y[[i]])
-        ind_id_X <- rownames(x_tmp)
-        if (is.null(ind_id_X) | is.null(ind_id_Y)) {
-          stop("Please provide the sample index of X and Y, since they do not have the same samples!")
-        } else {
-          pos <- match(ind_id_Y, ind_id_X)
-          x_tmp <- x_tmp[pos, , drop = FALSE]
-          if (sum(is.na(pos)) != 0) {
-            tmp$Y <- tmp$Y[-which(is.na(pos))]
-          }
-        }
-      }
-      # - if missing X
-      variable.name <- keep_variables[[dict_YX[i]]]
-      if (length(variable.name) != length(keep_variable_names)) {
-        x_extend <- matrix(0,
-          nrow = nrow(x_tmp), ncol = length(keep_variable_names),
-          dimnames = list(rownames(x_tmp), keep_variable_names)
-        )
-        variable.tmp <- intersect(keep_variable_names, variable.name)
-        pos <- match(variable.tmp, keep_variable_names)
-        tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos)
-        poss <- match(variable.tmp, variable.name)
-        x_extend[, pos] <- x_tmp[, poss]
-        x_tmp <- x_extend
-      }
-      if (change_x) {
-        dict_YX_final[i] == i
-        if (!intercept & !standardize) {
-          x_stand <- x_tmp
-        } else {
-          x_stand <- Rfast::standardise(x_tmp, center = intercept, scale = standardize)
-        }
-        x_stand[which(is.na(x_stand))] <- 0
-        tmp$X <- x_stand
-      }
-      cb_data$data[[flag]] <- tmp
+      cb_data$data[[flag]] <- ind_formated$result[[i]]
       names(cb_data$data)[flag] <- paste0("ind_outcome_", i)
       flag <- flag + 1
     }
-    cb_data$dict <- c(dict_YX_final)
-    ind_idx <- max(dict_YX)
-  } else {
-    ind_idx <- 0
-  }
+    cb_data$dict <- c(ind_formated$new_dict)
+  } 
   n_ind <- flag - 1
   # if summary: XtX XtY, YtY
   if (!is.null(Z) & !is.null(LD)) {
@@ -649,3 +598,117 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic
     original_dict = dict
   ))
 }
+
+#' process individual level input format
+#' @noRd
+process_individual_data <- function(X, Y, dict_YX, target_variants,
+                                    intercept = TRUE,
+                                    standardize = TRUE) {
+  
+  # Step 0: Check if sample IDs match between Y and corresponding X
+  for (i in 1:length(Y)) {
+    current_matrix_type <- dict_YX[i]
+    # If row counts match, we assume samples are in the same order
+    if (nrow(Y[[i]]) != nrow(X[[current_matrix_type]])) {
+      # Row counts don't match, so check rownames
+      ind_id_Y <- rownames(Y[[i]])
+      ind_id_X <- rownames(X[[current_matrix_type]])
+      if (is.null(ind_id_X) || is.null(ind_id_Y)) {
+        stop("Please provide the sample index of X and Y, since they do not have the same samples!")
+      }
+      # Find matching samples
+      pos <- match(ind_id_Y, ind_id_X)
+      if (sum(!is.na(pos)) == 0) {
+        stop("No samples in Y match any samples in the corresponding X matrix!")
+      }
+    }
+  }
+  
+  # Step 1: Update dictionary to handle duplicates samples
+  unique_count <- 0
+  new_dict <- integer(length(dict_YX))
+  sample_lists <- lapply(Y, rownames)
+  for (i in 1:length(sample_lists)) {
+    # Check if we've seen this list before with the same matrix type
+    found <- FALSE
+    current_matrix_type <- dict_YX[i]
+    # Only check previous lists if i > 1
+    if (i > 1) {
+      for (j in 1:(i-1)) {
+        # Only compare if they're from the same matrix
+        if (dict_YX[j] == current_matrix_type && 
+            identical(sample_lists[[i]], sample_lists[[j]])) {
+          new_dict[i] <- new_dict[j]
+          found <- TRUE
+          break
+        }
+      }
+    }
+    # If not seen before, assign new unique index
+    if (!found) {
+      unique_count <- unique_count + 1
+      new_dict[i] <- unique_count
+    }
+  }
+  
+  # Step 2: Create result list
+  result <- list()
+  
+  for (i in 1:length(Y)) {
+    tmp <- list(
+      "X" = NULL,
+      "Y" = NULL,
+      "N" = NULL,
+      "variable_miss" = NULL
+    )
+    matrix_type <- dict_YX[i]
+    # Get the appropriate matrix from X list
+    current_X <- X[[matrix_type]]
+    current_Y <- Y[[i]]
+    # Check if we need to match samples or if we can use as-is
+    if (nrow(current_Y) == nrow(current_X)) {
+      # Same number of rows, assume same order
+      matched_X <- current_X
+      matched_Y <- scale(current_Y, center = intercept, scale = standardize)
+    } else {
+      # Different number of rows, find matching samples
+      overlap_samples <- intersect(rownames(current_Y), rownames(current_X))
+      matched_X <- current_X[match(overlap_samples, rownames(current_X)), , drop = FALSE]
+      matched_Y <- current_Y[match(overlap_samples, rownames(current_Y)), , drop = FALSE]
+      matched_Y <- scale(matched_Y, center = intercept, scale = standardize)
+    }
+    tmp$Y <- matched_Y
+    tmp$N <- length(matched_Y)
+    
+    # - if missing X
+    variable.name <- colnames(matched_X)
+    if (length(variable.name) != length(target_variants)) {
+      x_extend <- matrix(0, nrow = nrow(matched_X), ncol = length(target_variants),
+                         dimnames = list(rownames(matched_X), target_variants) )
+      variable.tmp <- intersect(target_variants, variable.name)
+      pos <- match(variable.tmp, target_variants)
+      tmp$variable_miss <- setdiff(1:length(target_variants), pos)
+      poss <- match(variable.tmp, variable.name)
+      x_extend[, pos] <- matched_X[, poss]
+      matched_X <- x_extend
+    }
+    if (new_dict[i] == i) {
+      if (!intercept & !standardize) {
+        x_stand <- matched_X
+      } else {
+        x_stand <- Rfast::standardise(matched_X, center = intercept, scale = standardize)
+      }
+      x_stand[which(is.na(x_stand))] <- 0
+      tmp$X <- x_stand
+    }
+    
+    # Create components for each list
+    result[[i]] <- tmp
+  }
+  # Return results with the unified dictionary
+  return(list(
+    result = result,
+    new_dict = new_dict,
+    original_dict = dict_YX
+  ))
+}
diff --git a/man/colocboost.Rd b/man/colocboost.Rd
index 11ca213..28c9d4f 100644
--- a/man/colocboost.Rd
+++ b/man/colocboost.Rd
@@ -213,20 +213,22 @@ that may arise from small sample sizes or discrepancies in minor allele frequenc
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
 colnames(X) <- paste0("SNP", 1:P)
-L = 3
+L <- 3
 true_beta <- matrix(0, P, L)
-true_beta[10, 1] <- 0.5  # SNP10 affects trait 1
-true_beta[10, 2] <- 0.4  # SNP10 also affects trait 2 (colocalized)
+true_beta[10, 1] <- 0.5 # SNP10 affects trait 1
+true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized)
 true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2
 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3
 Y <- matrix(0, N, L)
-for (l in 1:L){  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) }
+for (l in 1:L) {
+  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1)
+}
 res <- colocboost(X = X, Y = Y)
 res$cos_details$cos$cos_index
 
diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd
index 0c74a54..bff458a 100644
--- a/man/colocboost_plot.Rd
+++ b/man/colocboost_plot.Rd
@@ -98,24 +98,25 @@ Visualization plot for each colcoalization event.
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
 colnames(X) <- paste0("SNP", 1:P)
-L = 3
+L <- 3
 true_beta <- matrix(0, P, L)
-true_beta[10, 1] <- 0.5  # SNP10 affects trait 1
-true_beta[10, 2] <- 0.4  # SNP10 also affects trait 2 (colocalized)
+true_beta[10, 1] <- 0.5 # SNP10 affects trait 1
+true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized)
 true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2
 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3
 Y <- matrix(0, N, L)
-for (l in 1:L){  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) }
+for (l in 1:L) {
+  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1)
+}
 res <- colocboost(X = X, Y = Y)
 colocboost_plot(res, plot_cols = 1)
 colocboost_plot(res, plot_cols = 1, outcome_idx = 1:3)
 
-
 }
 \concept{colocboost_plot}
diff --git a/man/get_cormat.Rd b/man/get_cormat.Rd
index f01aed9..e089c1c 100644
--- a/man/get_cormat.Rd
+++ b/man/get_cormat.Rd
@@ -20,8 +20,8 @@ This function calculates the correlation matrix (LD matrix) from individual leve
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
diff --git a/man/get_cos.Rd b/man/get_cos.Rd
index 69f3764..b4a94d0 100644
--- a/man/get_cos.Rd
+++ b/man/get_cos.Rd
@@ -20,20 +20,22 @@ A list of indices of variables in each CoS.
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
 colnames(X) <- paste0("SNP", 1:P)
-L = 3
+L <- 3
 true_beta <- matrix(0, P, L)
-true_beta[10, 1] <- 0.5  # SNP10 affects trait 1
-true_beta[10, 2] <- 0.4  # SNP10 also affects trait 2 (colocalized)
+true_beta[10, 1] <- 0.5 # SNP10 affects trait 1
+true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized)
 true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2
 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3
 Y <- matrix(0, N, L)
-for (l in 1:L){  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) }
+for (l in 1:L) {
+  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1)
+}
 res <- colocboost(X = X, Y = Y)
 get_cos(res, coverage = 0.75)
 
diff --git a/man/get_cos_summary.Rd b/man/get_cos_summary.Rd
index 4e4197b..73e4cf9 100644
--- a/man/get_cos_summary.Rd
+++ b/man/get_cos_summary.Rd
@@ -41,20 +41,22 @@ A summary table for colocalization events with the following columns:
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
 colnames(X) <- paste0("SNP", 1:P)
-L = 3
+L <- 3
 true_beta <- matrix(0, P, L)
-true_beta[10, 1] <- 0.5  # SNP10 affects trait 1
-true_beta[10, 2] <- 0.4  # SNP10 also affects trait 2 (colocalized)
+true_beta[10, 1] <- 0.5 # SNP10 affects trait 1
+true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized)
 true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2
 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3
 Y <- matrix(0, N, L)
-for (l in 1:L){  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) }
+for (l in 1:L) {
+  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1)
+}
 res <- colocboost(X = X, Y = Y)
 get_cos_summary(res)
 
diff --git a/man/get_strong_colocalization.Rd b/man/get_strong_colocalization.Rd
index 0d43f37..5bc2c93 100644
--- a/man/get_strong_colocalization.Rd
+++ b/man/get_strong_colocalization.Rd
@@ -41,20 +41,22 @@ A \code{"colocboost"} object with some or all of the following elements:
 \examples{
 # colocboost example
 set.seed(1)
-N = 1000
-P = 100
+N <- 1000
+P <- 100
 # Generate X with LD structure
 sigma <- 0.9^abs(outer(1:P, 1:P, "-"))
 X <- MASS::mvrnorm(N, rep(0, P), sigma)
 colnames(X) <- paste0("SNP", 1:P)
-L = 3
+L <- 3
 true_beta <- matrix(0, P, L)
-true_beta[10, 1] <- 0.5  # SNP10 affects trait 1
-true_beta[10, 2] <- 0.4  # SNP10 also affects trait 2 (colocalized)
+true_beta[10, 1] <- 0.5 # SNP10 affects trait 1
+true_beta[10, 2] <- 0.4 # SNP10 also affects trait 2 (colocalized)
 true_beta[50, 2] <- 0.3 # SNP50 only affects trait 2
 true_beta[80, 3] <- 0.6 # SNP80 only affects trait 3
 Y <- matrix(0, N, L)
-for (l in 1:L){  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1) }
+for (l in 1:L) {
+  Y[, l] <- X \%*\% true_beta[, l] + rnorm(N, 0, 1)
+}
 res <- colocboost(X = X, Y = Y)
 res$cos_details$cos$cos_index
 filter_res <- get_strong_colocalization(res, cos_npc_cutoff = 0.5, npc_outcome_cutoff = 0.2)
diff --git a/vignettes/Individual_Level_Colocalization.Rmd b/vignettes/Individual_Level_Colocalization.Rmd
index 2f38a96..74a89a4 100644
--- a/vignettes/Individual_Level_Colocalization.Rmd
+++ b/vignettes/Individual_Level_Colocalization.Rmd
@@ -87,7 +87,9 @@ colocboost_plot(res)
 
 ### Results Interpretation
 
-For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal at FIXME (link).
+For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal 
+at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and
+[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html).
 
 
 # 3. Other structures of individual level data
@@ -100,21 +102,48 @@ This is particularly useful when the same individuals are used for different tra
 
 - **Input Format**:
     - `X` is a single matrix containing genotype data for all individuals. 
-    - `Y` can be i) a matrix with N * L dimension; ii) a list of phenotype vectors for L traits.
+    - `Y` can be i) a matrix with $N \times L$ dimension; ii) a list of phenotype vectors for $L$ traits.
 
 
 ```{r single-x}
 # Extract a single SNP (as a vector)
-X_single <- X[[1]]  # First SNP for all individuals
+# X_single <- X[[1]]  # First SNP for all individuals
 
 # Run colocboost
-res <- colocboost(X = X, Y = Y)
+# res <- colocboost(X = X_single, Y = Y)
 
 # Identified CoS
-res$cos_details$cos$cos_index
+# res$cos_details$cos$cos_index
+```
+
+
+## 3.2. Genotype matrix is a superset of samples across different phenotypes
+
+When the LD matrix includes a superset of variants across different summary statistics, with **Input Format**:
+
+- `sumstat` is a list of data.frames for all traits
+- `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits.
+- The LD matrix should contain all variants present in the summary statistics data frames.
+- This is particularly useful when you have a large LD matrix from a reference panel and want to use it for multiple summary statistics datasets. 
+It allows for efficient analysis without redundancy.
+
+
+```{r superset-X}
+# Create sumstat with different number of variants - remove 100 variants in each sumstat
+# LD_superset <- LD
+# sumstat <- lapply(Sumstat_5traits$sumstat, function(x) x[-sample(1:nrow(x), 100), , drop = FALSE])
+
+# Run colocboost
+# res <- colocboost(sumstat = sumstat, LD = LD_superset)
+
+# Identified CoS
+# res$cos_details$cos$cos_index
 ```
 
-## 3.2. Arbitrary input matrices with mapping provided
+
+
+
+## 3.3. Arbitrary input matrices with mapping provided
 
 When studying multiple traits with arbitrary genotype matrices for different traits, 
 we also provide the interface for arbitrary genotype matrices with multiple phenotypes. 
diff --git a/vignettes/Input_Data_Format.Rmd b/vignettes/Input_Data_Format.Rmd
index 39ba734..e652281 100644
--- a/vignettes/Input_Data_Format.Rmd
+++ b/vignettes/Input_Data_Format.Rmd
@@ -2,7 +2,7 @@
 title: "Input Data Format"
 output: 
   bookdown::html_document2:
-    base_format: rmarkdown::html_vignette
+    # base_format: rmarkdown::html_vignette
     toc: true
     toc_depth: 2
     number_sections: true
@@ -26,7 +26,7 @@ library(colocboost)
 
 This vignette documents the standard input data formats of `colocboost`.
 
-## 1. Individual Level Data
+# 1. Individual Level Data
 
 For analyses using individual-level data, the basic format for single trait is as follows:
 
@@ -43,11 +43,11 @@ For example:
 `colocboost` also offers flexible input options (see detailed usage with different input formats, 
 refer to [Individual Level Data Colocalization](https://statfungen.github.io/colocboost/articles/Individual_Level_Colocalization.html).):
 
-- Single X matrix with N * P with Y matrix with N * L for L traits.
+- Single X matrix with $N \times P$ with Y matrix with $N \times L$ for $L$ traits.
 - Multiple X matrices and unmatched Y vectors with a mapping dictionary.
 
 
-## 2. Summary Statistics
+# 2. Summary Statistics
 
 For analyses using summary statistics, the basic format for single trait is as follows:
 
@@ -66,7 +66,7 @@ head(Sumstat_5traits$sumstat[[1]])
 
 The input format for multiple traits is similar, but `sumstat` should be a list of data frames `sumstat = list(sumstat1, sumstat2, sumstat3)`. 
 The flexibility of input format for multiple traits is as follows (see detailed usage with different input formats, 
-refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).)::
+refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocboost/articles/Summary_Level_Colocalization.html).):
 
 - One LD matrix with a superset of variants in `sumstat` for all traits is allowed.
 - Multiple LD matrices, each corresponding to a different trait, are also allowed for the trait-specific LD structure.
@@ -74,10 +74,41 @@ refer to [Summary Statistics Colocalization](https://statfungen.github.io/colocb
 
 
 
-## 3. Optional: mapping between arbitary input $X$ and $Y$
+# 3. Optional: mapping between arbitary input $X$ and $Y$
 
-TO-DO-LIST
+For analysis when including multiple genotype matrices `X` with unmatched arbitary phenotype vectors `Y`, 
+a mapping dictionary `dict_YX` is required to indicate the relationship between `X` and `Y`.
+Similiarly, when multiple LD matrices with unmatched arbitrary multiple summary statistics `sumstat` are used,
+a mapping dictionary `dict_sumstatLD` is required to indicate the relationship between `sumstat` and `LD`.
 
-## 4. Hyprcoloc compatible format: effect size and standard error matrices
+For example, considering three genotype matrices `X = list(X1, X2, X3)` and 6 phenotype vectors `Y = list(Y1, Y2, Y3, Y4, Y5, Y6)`, where
+
+- `X1` is for trait 1, trait 2, trait 3
+- `X2` is for trait 4, trait 5
+- `X3` is for trait 6
+
+Then, you need to define a 6 by 2 matrix mapping dictionary `dict_YX` as follows: 
+
+- The first column should be `c(1,2,3,4,5,6)` for 6 traits. 
+- The second column should be `c(1,1,1,2,2,3)` for 3 genotype matrices.
+
+Here, each row indicates the trait index and the corresponding genotype matrix index.
+
+```{r dict_YX}
+dict_YX <- cbind(c(1,2,3,4,5,6), c(1,1,1,2,2,3))
+dict_YX
+```
+
+
+# 4. Hyprcoloc compatible format: effect size and standard error matrices
+
+ColocBoost also provides a flexibility to use Hyprcoloc compatible format for summary statistics with and without LD matrix.
+For example, when anaylze L traits for the same P variants with the specified effect size and standard error matrices:
+
+- `effect_est` (required) is $P \times L$ matrix of variable regression coefficients (i.e. regression beta values) in the genomic region.
+- `effect_se` (required) is $P \times L$ matrix of standard errors for the regression coefficients.
+- `effect_n` (highly recommended) is either a scalar or a vector of sample sizes for estimating regression coefficients.
+- `LD` (optional) is LD matrix for the $P$ variants. If it is not provided, it will apply LD-free ColocBoost.
+
+See more details about data format to implement LD-free ColocBoost and LD-mistmatch diagnosis in FIXME.
 
-TO-DO-LIST
diff --git a/vignettes/Summary_Statistics_Colocalization.Rmd b/vignettes/Summary_Statistics_Colocalization.Rmd
index a55e6f9..4d1dbe1 100644
--- a/vignettes/Summary_Statistics_Colocalization.Rmd
+++ b/vignettes/Summary_Statistics_Colocalization.Rmd
@@ -88,7 +88,9 @@ colocboost_plot(res)
 
 ### Results Interpretation
 
-For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal at FIXME (link).
+For comprehensive tutorials on result interpretation and advanced visualization techniques, please visit our documentation portal 
+at [Visualization of ColocBoost Results](https://statfungen.github.io/colocboost/articles/Visualization_ColocBoost_Output.html) and
+[Interpret ColocBoost Output](https://statfungen.github.io/colocboost/articles/Interpret_ColocBoost_Output.html).
 
 
 # 3. Other summary statistics and LD input combinations

From 531c6e9af37910e5460f3598e54e99af149c766f Mon Sep 17 00:00:00 2001
From: xuewei cao <36172337+xueweic@users.noreply.github.com>
Date: Thu, 17 Apr 2025 13:50:21 -0400
Subject: [PATCH 3/3] minor fix

---
 R/colocboost_init.R                           | 29 +++++--------------
 vignettes/Individual_Level_Colocalization.Rmd | 29 ++++++++++---------
 2 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/R/colocboost_init.R b/R/colocboost_init.R
index 6368dfd..1223ecd 100644
--- a/R/colocboost_init.R
+++ b/R/colocboost_init.R
@@ -625,30 +625,17 @@ process_individual_data <- function(X, Y, dict_YX, target_variants,
   }
   
   # Step 1: Update dictionary to handle duplicates samples
-  unique_count <- 0
-  new_dict <- integer(length(dict_YX))
   sample_lists <- lapply(Y, rownames)
-  for (i in 1:length(sample_lists)) {
-    # Check if we've seen this list before with the same matrix type
-    found <- FALSE
-    current_matrix_type <- dict_YX[i]
-    # Only check previous lists if i > 1
-    if (i > 1) {
-      for (j in 1:(i-1)) {
-        # Only compare if they're from the same matrix
-        if (dict_YX[j] == current_matrix_type && 
-            identical(sample_lists[[i]], sample_lists[[j]])) {
-          new_dict[i] <- new_dict[j]
-          found <- TRUE
-          break
-        }
+  new_dict <- 1:length(dict_YX)
+  # For each pair of Y indices
+  for (i in 1:(length(dict_YX)-1)) {
+    for (j in (i+1):length(dict_YX)) {
+      # Check if they map to the same X matrix AND have the same samples
+      if (dict_YX[i] == dict_YX[j] && identical(sample_lists[[i]], sample_lists[[j]])) {
+        # If same matrix and same samples, use the smaller index
+        new_dict[j] <- new_dict[i]
       }
     }
-    # If not seen before, assign new unique index
-    if (!found) {
-      unique_count <- unique_count + 1
-      new_dict[i] <- unique_count
-    }
   }
   
   # Step 2: Create result list
diff --git a/vignettes/Individual_Level_Colocalization.Rmd b/vignettes/Individual_Level_Colocalization.Rmd
index 74a89a4..b2a12f7 100644
--- a/vignettes/Individual_Level_Colocalization.Rmd
+++ b/vignettes/Individual_Level_Colocalization.Rmd
@@ -107,37 +107,40 @@ This is particularly useful when the same individuals are used for different tra
 
 ```{r single-x}
 # Extract a single SNP (as a vector)
-# X_single <- X[[1]]  # First SNP for all individuals
+X_single <- X[[1]]  # First SNP for all individuals
 
 # Run colocboost
-# res <- colocboost(X = X_single, Y = Y)
+res <- colocboost(X = X_single, Y = Y)
 
 # Identified CoS
-# res$cos_details$cos$cos_index
+res$cos_details$cos$cos_index
 ```
 
 
 ## 3.2. Genotype matrix is a superset of samples across different phenotypes
 
-When the LD matrix includes a superset of variants across different summary statistics, with **Input Format**:
+When the genotype matrix includes a superset of samples across different phenotypes, with **Input Format**:
 
-- `sumstat` is a list of data.frames for all traits
-- `LD` is a matrix of linkage disequilibrium (LD) information for all variants across all traits.
-- The LD matrix should contain all variants present in the summary statistics data frames.
-- This is particularly useful when you have a large LD matrix from a reference panel and want to use it for multiple summary statistics datasets. 
+- `X` is a matrix of genotype data for all individuals.
+- `Y` is a list of phenotype vectors for different outcomes.
+- Row names of both `X` and `Y` should be provided to match individuals - same format of individual id.
+- It is bettern if `X` contain all samples present in the phenotype vectors (optional).
+- This is particularly useful when you have a large genotype matrix and want to use it for multiple phenotypes with different samples. 
 It allows for efficient analysis without redundancy.
 
 
 ```{r superset-X}
-# Create sumstat with different number of variants - remove 100 variants in each sumstat
-# LD_superset <- LD
-# sumstat <- lapply(Sumstat_5traits$sumstat, function(x) x[-sample(1:nrow(x), 100), , drop = FALSE])
+# Create phenotype with different samples - remove 50 samples trait 1 and trait 3.
+X_superset <- X[[1]] 
+Y_remove <- Y
+Y_remove[[1]] <- Y[[1]][-sample(1:length(Y[[1]]),50), , drop=F]
+Y_remove[[3]] <- Y[[3]][-sample(1:length(Y[[3]]),50), , drop=F]
 
 # Run colocboost
-# res <- colocboost(sumstat = sumstat, LD = LD_superset)
+res <- colocboost(X = X_superset, Y = Y_remove)
 
 # Identified CoS
-# res$cos_details$cos$cos_index
+res$cos_details$cos$cos_index
 ```