diff --git a/NAMESPACE b/NAMESPACE index 9f3d3773..7dc90a82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(QUAIL_rank_score_pipeline) export(adjust_susie_weights) export(align_variant_names) export(allele_qc) +export(auto_decision) export(batch_load_twas_weights) export(bayes_a_rss_weights) export(bayes_a_weights) @@ -28,6 +29,8 @@ export(ctwas_ld_loader) export(dentist) export(dentist_single_window) export(enet_weights) +export(extract_cs_info) +export(extract_top_pip_info) export(filter_invalid_summary_stat) export(filter_mixture_components) export(filter_variants_by_ld_reference) @@ -38,6 +41,7 @@ export(fsusie_wrapper) export(gbayes_rss) export(get_ctwas_meta_data) export(get_nested_element) +export(get_susie_result) export(glmnet_weights) export(harmonize_twas) export(lasso_weights) @@ -70,6 +74,7 @@ export(multicontext_ld_clumping) export(multigene_udr) export(multivariate_analysis_pipeline) export(mvsusie_weights) +export(parse_cs_corr) export(parse_region) export(parse_snp_info) export(parse_variant_id) @@ -107,6 +112,7 @@ export(twas_z) export(univariate_analysis_pipeline) export(wald_test_pval) export(xqtl_enrichment_wrapper) +export(z_to_pvalue) import(vroom) importFrom(IRanges,IRanges) importFrom(IRanges,end) @@ -120,6 +126,7 @@ importFrom(bigstatsr,FBM.code256) importFrom(coloc,coloc.bf_bf) importFrom(data.table,as.data.table) importFrom(data.table,fread) +importFrom(data.table,setDT) importFrom(data.table,setnames) importFrom(doFuture,registerDoFuture) importFrom(dplyr,"%>%") @@ -127,6 +134,7 @@ importFrom(dplyr,across) importFrom(dplyr,arrange) importFrom(dplyr,as_tibble) importFrom(dplyr,bind_rows) +importFrom(dplyr,case_when) importFrom(dplyr,distinct) importFrom(dplyr,everything) importFrom(dplyr,filter) diff --git a/R/LD.R b/R/LD.R index 6350b13e..426790ff 100644 --- a/R/LD.R +++ b/R/LD.R @@ -348,9 +348,6 @@ load_LD_matrix <- function(LD_meta_file_path, region, extract_coordinates = NULL # Use region chromosome as fallback for 
empty blocks block_chroms[j] <- as.character(intersected_LD_files$region$chrom) } - - # Remove large objects to free memory - rm(LD_matrix_processed, extracted_LD_list) } # Create combined LD matrix @@ -494,7 +491,6 @@ partition_LD_matrix <- function(ld_data, merge_small_blocks = TRUE, # Partition the matrix based on block metadata result <- extract_block_matrices(combined_matrix, block_metadata, variant_ids) - return(result) } @@ -708,6 +704,7 @@ extract_block_matrices <- function(matrix, block_metadata, variant_ids) { stringsAsFactors = FALSE ) variant_mapping <- rbind(variant_mapping, block_mapping) + } return(list( diff --git a/R/file_utils.R b/R/file_utils.R index f0c8ce1e..94533314 100644 --- a/R/file_utils.R +++ b/R/file_utils.R @@ -489,6 +489,11 @@ load_regional_association_data <- function(genotype, # PLINK file )) } +#' Load Regional Univariate Association Data +#' +#' This function loads regional association data for univariate analysis. +#' It includes residual matrices, original genotype data, and additional metadata. +#' #' @importFrom matrixStats colVars #' @return A list #' @export @@ -508,6 +513,11 @@ load_regional_univariate_data <- function(...) { )) } +#' Load Regional Data for Regression Modeling +#' +#' This function loads regional association data formatted for regression modeling. +#' It includes phenotype, genotype, and covariate matrices along with metadata. +#' #' @return A list #' @export load_regional_regression_data <- function(...) { @@ -543,6 +553,11 @@ pheno_list_to_mat <- function(data_list) { return(data_list) } +#' Load and Preprocess Regional Multivariate Data +#' +#' This function loads regional association data and processes it into a multivariate format. +#' It optionally filters out samples based on missingness thresholds in the response matrix. 
+#' #' @importFrom matrixStats colVars #' @return A list #' @export @@ -578,6 +593,10 @@ load_regional_multivariate_data <- function(matrix_y_min_complete = NULL, # when )) } +#' Load Regional Functional Association Data +#' +#' This function loads precomputed regional functional association data. +#' #' @return A list #' @export load_regional_functional_data <- function(...) { diff --git a/R/misc.R b/R/misc.R index 3e9ecdee..8f9e5a7f 100644 --- a/R/misc.R +++ b/R/misc.R @@ -602,6 +602,36 @@ z_to_beta_se <- function(z, maf, n) { return(data.frame(beta = beta, se = se, maf = p)) } +#' Convert Z-scores to P-values +#' +#' This function calculates p-values from given z-scores using a two-tailed normal distribution. +#' It supports vector input to process multiple z-scores simultaneously. +#' +#' @param z Numeric vector. The z-scores to be converted to p-values. +#' +#' @return A numeric vector of p-values corresponding to the input z-scores. +#' +#' @details +#' The function uses the following formula to calculate p-values: +#' p-value = 2 * Φ(-|z|) +#' Where Φ is the cumulative distribution function of the standard normal distribution. +#' +#' @examples +#' z <- c(2.5, -1.8, 3.2, 0.7) +#' pvalues <- z_to_pvalue(z) +#' print(pvalues) +#' +#' @note +#' This function assumes that the input z-scores are from a two-tailed test and +#' are normally distributed. It calculates two-sided p-values. +#' For extremely large absolute z-scores, the resulting p-values may be computed as zero +#' due to floating-point limitations in R. This occurs when the absolute z-score > 37. +#' +#' @export +z_to_pvalue <- function(z) { + 2 * pnorm(-abs(z)) +} + #' Filter events based on provided context name pattern #' #' @param events A character vector of event names diff --git a/R/quail_vqtl.R b/R/quail_vqtl.R index 9ba03d92..7649c624 100644 --- a/R/quail_vqtl.R +++ b/R/quail_vqtl.R @@ -8,11 +8,10 @@ #' @param return_residuals Logical, whether to return residuals (default: FALSE). 
#' @return A list containing regression results: \code{betahat}, \code{sebetahat}, \code{z_scores}, \code{p_values}, and \code{q_values}. #' @examples -#' @noRd #' X <- matrix(rnorm(1000), 100, 10) #' y <- rnorm(100) #' results <- univariate_regression(X, y) - +#' @noRd univariate_regression <- function(X, y, Z = NULL, center = TRUE, scale = FALSE, return_residuals = FALSE) { y_na <- which(is.na(y)) @@ -96,10 +95,10 @@ univariate_regression <- function(X, y, Z = NULL, center = TRUE, #' @param covariates Optional numeric matrix of covariates (n x k), where k is the number of covariates. #' @return A data frame containing regression results for each SNP, including \code{BETA}, \code{SE}, \code{Z}, \code{P}, and \code{Q}. #' @examples -#' @noRd #' genotype <- matrix(rbinom(1000 * 10, 2, 0.3), 1000, 10) #' phenotype <- rnorm(1000) #' results <- run_linear_regression1(genotype, phenotype) +#' @noRd run_linear_regression <- function(genotype, phenotype, covariates = NULL) { if (!is.null(covariates)) { covariates <- as.data.frame(lapply(covariates, as.numeric)) diff --git a/R/quantile_twas_weight.R b/R/quantile_twas_weight.R index 8f722779..dbcd1b88 100644 --- a/R/quantile_twas_weight.R +++ b/R/quantile_twas_weight.R @@ -412,7 +412,6 @@ corr_filter <- function(X, cor_thres = 0.8) { #' @param response Optional response vector for "response_correlation" strategy #' @param max_iterations Maximum number of iterations to attempt removing problematic columns #' @return Cleaned matrix X with problematic columns removed -#' @importFrom stats qr #' @noRd check_remove_highcorr_snp <- function(X = X, C = C, strategy = c("correlation", "variance", "response_correlation"), response = NULL, max_iterations = 300, corr_thresholds = seq(0.75, 0.5, by = -0.05)) { strategy <- match.arg(strategy) diff --git a/R/raiss.R b/R/raiss.R index 73fd06c3..278292fa 100644 --- a/R/raiss.R +++ b/R/raiss.R @@ -28,7 +28,7 @@ raiss_single_matrix <- function(ref_panel, known_zscores, LD_matrix, lamb = 0.01 
knowns_id <- intersect(known_zscores$variant_id, ref_panel$variant_id) knowns <- which(ref_panel$variant_id %in% knowns_id) unknowns <- which(!ref_panel$variant_id %in% knowns_id) - + # Handle edge cases if (length(knowns) == 0) { if (verbose) message("No known variants found, cannot perform imputation.") @@ -51,10 +51,8 @@ raiss_single_matrix <- function(ref_panel, known_zscores, LD_matrix, lamb = 0.01 # Call raiss_model results <- raiss_model(zt, sig_t, sig_i_t, lamb, rcond) - # Format the results results <- format_raiss_df(results, ref_panel, unknowns) - # Filter output results <- filter_raiss_output(results, R2_threshold, minimum_ld, verbose) @@ -72,7 +70,6 @@ raiss_single_matrix <- function(ref_panel, known_zscores, LD_matrix, lamb = 0.01 } else { as.matrix(LD_matrix) } - # Return results return(list( result_nofilter = result_nofilter, @@ -193,7 +190,6 @@ raiss <- function(ref_panel, known_zscores, LD_matrix, lamb = 0.01, rcond = 0.01 lamb, rcond, R2_threshold, minimum_ld, verbose = FALSE ) - # Skip if block returned NULL (no known variants) if (!is.null(block_result)) { results_list[[block_id]] <- block_result @@ -346,14 +342,26 @@ filter_raiss_output <- function(zscores, R2_threshold = 0.6, minimum_ld = 5, ver # Print report if (verbose) { + max_label_length <- max(nchar(c( + "Variants before filter:", + "Non-imputed variants:", + "Imputed variants:", + "Variants filtered because of low LD score:", + "Variants filtered because of low R2:", + "Remaining variants after filter:" + ))) + + format_line <- function(label, value) { + sprintf("%-*s %d", max_label_length, paste0(label, ":"), value) + } + message("IMPUTATION REPORT\n") - message("Number of SNPs:\n") - message("before filter:", NSNPs_bf_filt, "\n") - message("not imputed:", NSNPs_initial, "\n") - message("imputed:", NSNPs_imputed, "\n") - message("filtered because of LD:", NSNPs_ld_filt, "\n") - message("filtered because of R2:", NSNPs_R2_filt, "\n") - message("after filter:", NSNPs_af_filt, "\n") + 
message(format_line("Variants before filter", NSNPs_bf_filt)) + message(format_line("Non-imputed variants", NSNPs_initial)) + message(format_line("Imputed variants", NSNPs_imputed)) + message(format_line("Variants filtered because of low LD score", NSNPs_ld_filt)) + message(format_line("Variants filtered because of low R2", NSNPs_R2_filt)) + message(format_line("Remaining variants after filter", NSNPs_af_filt)) } return(zscore_list = list(zscores_nofilter = zscores_nofilter, zscores = zscores)) } diff --git a/R/regularized_regression.R b/R/regularized_regression.R index b8a84e11..2d21335a 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -437,6 +437,12 @@ enet_weights <- function(X, y) glmnet_weights(X, y, 0.5) #' @export lasso_weights <- function(X, y) glmnet_weights(X, y, 1) +#' Compute Weights Using mr.ash Shrinkage +#' +#' This function fits the `mr.ash` model (adaptive shrinkage regression) to estimate weights +#' for a given set of predictors and response. It uses optional prior standard deviation initialization +#' and can accept custom initial beta values. 
+#' #' @examples #' wgt.mr.ash <- mrash_weights(eqtl$X, eqtl$y_res, beta.init = lasso_weights(X, y)) #' @importFrom stats predict diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index a80ba03c..6826ad1a 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -183,7 +183,7 @@ rss_analysis_pipeline <- function( coverage = c(0.95, 0.7, 0.5), signal_cutoff = 0.025 ), impute = TRUE, impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01), - pip_cutoff_to_skip = 0, remove_indels = FALSE, comment_string = "#") { + pip_cutoff_to_skip = 0, remove_indels = FALSE, comment_string = "#", diagnostics = FALSE) { res <- list() rss_input <- load_rss_data( sumstat_path = sumstat_path, column_file_path = column_file_path, @@ -200,7 +200,6 @@ rss_analysis_pipeline <- function( if (nrow(sumstats)==0){ return(list(rss_data_analyzed = sumstats)) } - # Preprocess the input data preprocess_results <- rss_basic_qc(sumstats, LD_data, skip_region = skip_region, remove_indels = remove_indels) sumstats <- preprocess_results$sumstats @@ -219,14 +218,12 @@ rss_analysis_pipeline <- function( message(paste("Follow-up on region because signals above PIP threshold", pip_cutoff_to_skip, "were detected in initial model screening.")) } } - # Perform quality control if (!is.null(qc_method)) { qc_results <- summary_stats_qc(sumstats, LD_data, n = n, var_y = var_y, method = qc_method) sumstats <- qc_results$sumstats LD_mat <- qc_results$LD_mat } - # Perform imputation if (impute) { LD_matrix <- partition_LD_matrix(LD_data) @@ -234,7 +231,6 @@ rss_analysis_pipeline <- function( sumstats <- impute_results$result_filter LD_mat <- impute_results$LD_mat } - # Perform fine-mapping if (!is.null(finemapping_method)) { pri_coverage <- finemapping_opts$coverage[1] @@ -252,14 +248,112 @@ rss_analysis_pipeline <- function( } } if (impute & !is.null(qc_method)) { - method_name <- paste0(toupper(qc_method), "_RAISS_imputed") + method_name <- 
paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") } else if (!impute & !is.null(qc_method)) { - method_name <- toupper(qc_method) + method_name <- paste0(finemapping_method, "_", toupper(qc_method)) } else { - method_name <- "NO_QC" + method_name <- paste0(finemapping_method, "_", "NO_QC") } result_list <- list() result_list[[method_name]] <- res result_list[["rss_data_analyzed"]] <- sumstats + + block_cs_metrics <- list() + if (diagnostics) { + if (length(res) > 0) { + bvsr_res = get_susie_result(res) + bvsr_cs_num = if(!is.null(bvsr_res)) length(bvsr_res$sets$cs) else NULL + if (isTRUE(bvsr_cs_num > 0)) { # have CS + # Get the names of the credible sets + cs_names_bvsr = names(bvsr_res$sets$cs) + block_cs_metrics = extract_cs_info(con_data = res, cs_names = cs_names_bvsr, top_loci_table = res$top_loci) + } else { # no CS + if (sum(bvsr_res$pip > finemapping_opts$signal_cutoff) > 0) { + block_cs_metrics = extract_top_pip_info(res) + } + } + } + # sensitive check for additional analyses + if (!is.null(block_cs_metrics) && length(block_cs_metrics) > 0) { + block_cs_metrics = parse_cs_corr(block_cs_metrics) + cs_row = block_cs_metrics %>% filter(!is.na(block_cs_metrics$variants_per_cs)) + if (nrow(cs_row)>1) {# CS > 1 + block_cs_metrics <- block_cs_metrics %>% + mutate(max_cs_corr_study_block = if(all(is.na(cs_corr_max))) { + NA_real_ + } else { + max(cs_corr_max, na.rm = TRUE) + }) + if (any(block_cs_metrics$p_value > 1e-4 | block_cs_metrics$max_cs_corr_study_block > 0.5)) { + finemapping_method <- "bayesian_conditional_regression" + pri_coverage <- finemapping_opts$coverage[1] + sec_coverage <- if (length(finemapping_opts$coverage) > 1) finemapping_opts$coverage[-1] else NULL + bcr <- susie_rss_pipeline(sumstats, LD_mat, + n = n, var_y = var_y, + L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, + analysis_method = finemapping_method, + coverage = pri_coverage, + secondary_coverage = sec_coverage, + 
signal_cutoff = finemapping_opts$signal_cutoff + ) + if (!is.null(qc_method)) { + bcr$outlier_number <- qc_results$outlier_number + } + if (impute & !is.null(qc_method)) { + method_name <- paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") + } else if (!impute & !is.null(qc_method)) { + method_name <- paste0(finemapping_method, "_", toupper(qc_method)) + } else { + method_name <- paste0(finemapping_method, "_", "NO_QC") + } + result_list[[method_name]] <- bcr + finemapping_method <- "single_effect" + sumstats <- preprocess_results$sumstats + LD_mat <- preprocess_results$LD_mat + ser <- susie_rss_pipeline(sumstats, LD_mat, + n = n, var_y = var_y, + L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, + analysis_method = finemapping_method, + coverage = pri_coverage, + secondary_coverage = sec_coverage, + signal_cutoff = finemapping_opts$signal_cutoff + ) + qc_method <- NULL + impute <- FALSE + if (impute & !is.null(qc_method)) { + method_name <- paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") + } else if (!impute & !is.null(qc_method)) { + method_name <- paste0(finemapping_method, "_", toupper(qc_method)) + } else { + method_name <- paste0(finemapping_method, "_", "NO_QC") + } + result_list[[method_name]] <- ser + } + } else { # CS = 1 or NA + finemapping_method <- "single_effect" + sumstats <- preprocess_results$sumstats + LD_mat <- preprocess_results$LD_mat + ser <- susie_rss_pipeline(sumstats, LD_mat, + n = n, var_y = var_y, + L = finemapping_opts$init_L, max_L = finemapping_opts$max_L, l_step = finemapping_opts$l_step, + analysis_method = finemapping_method, + coverage = pri_coverage, + secondary_coverage = sec_coverage, + signal_cutoff = finemapping_opts$signal_cutoff + ) + qc_method <- NULL + impute <- FALSE + if (impute & !is.null(qc_method)) { + method_name <- paste0(finemapping_method, "_", toupper(qc_method), "_RAISS_imputed") + } else if (!impute & !is.null(qc_method)) { + 
method_name <- paste0(finemapping_method, "_", toupper(qc_method))
+      } else {
+        method_name <- paste0(finemapping_method, "_", "NO_QC")
+      }
+      result_list[[method_name]] <- ser
+    }
+    result_list[["diagnostics"]] <- block_cs_metrics
+    }
+  }
   return(result_list)
 }
diff --git a/R/univariate_rss_diagnostics.R b/R/univariate_rss_diagnostics.R
new file mode 100644
index 00000000..271b6256
--- /dev/null
+++ b/R/univariate_rss_diagnostics.R
@@ -0,0 +1,307 @@
+#' Extract SuSiE Results from Finemapping Data
+#'
+#' This function extracts the trimmed SuSiE results from a finemapping data object,
+#' typically obtained from a finemapping RDS file. It's designed to work with
+#' the method layer of these files, often named as 'method_RAISS_imputed', 'method',
+#' or 'method_NO_QC'. This layer is right under the study layer.
+#'
+#' @param con_data List. The method layer data from a finemapping RDS file.
+#'
+#' @return The trimmed SuSiE results (`$susie_result_trimmed`) if available,
+#' otherwise NULL.
+#'
+#' @details
+#' The function checks if the input data is empty or if the `$susie_result_trimmed`
+#' element is missing. It returns NULL in these cases. If `$susie_result_trimmed`
+#' exists and is not empty, it returns this element.
+#'
+#' @note
+#' This function is particularly useful when working with large datasets
+#' where not all method layers may contain valid SuSiE results or method layer.
+#'
+#' @export
+get_susie_result <- function(con_data) {
+  if (length(con_data) == 0) return(NULL)
+  if (length(con_data$susie_result_trimmed) == 0) {
+    # Emit the diagnostic *before* returning; previously this print() call
+    # sat after return(NULL) and was unreachable dead code.
+    message("$susie_result_trimmed is NULL or empty for this method layer")
+    return(NULL)
+  } else {
+    return(con_data$susie_result_trimmed)
+  }
+}
+
+#' Process Credible Sets (CS) from Finemapping Results
+#'
+#' This function extracts and processes information for each Credible Set (CS)
+#' from finemapping results, typically obtained from a finemapping RDS file.
+#'
+#' @param con_data List.
The method layer data from a finemapping RDS file that is not empty.
+#' @param cs_names Character vector. Names of the Credible Sets, usually in the format "L_".
+#' @param top_loci_table Data frame. The $top_loci layer data from the finemapping results.
+#'
+#' @return A data frame with one row per CS, containing the following columns:
+#' \item{cs_name}{Name of the Credible Set}
+#' \item{variants_per_cs}{Number of variants in the CS}
+#' \item{top_variant}{ID of the variant with the highest PIP in the CS}
+#' \item{top_variant_index}{Global index of the top variant}
+#' \item{top_pip}{Highest Posterior Inclusion Probability (PIP) in the CS}
+#' \item{top_z}{Z-score of the top variant}
+#' \item{p_value}{P-value calculated from the top Z-score}
+#' \item{cs_corr}{Pairwise correlations of other CSs in this RDS with the CS of
+#' the current row, delimited by ',', if there is more than one CS in this RDS file}
+#'
+#' @details
+#' This function is designed to be used only when there is at least one Credible Set
+#' in the finemapping results usually for a given study and block. It processes each CS,
+#' extracting key information such as the top variant, its statistics, and
+#' correlation information between multiple CS if available.
+#'
+#' @importFrom purrr map
+#' @importFrom dplyr bind_rows
+#'
+#' @export
+extract_cs_info <- function(con_data, cs_names, top_loci_table) {
+  results <- map(seq_along(cs_names), function(i) {
+    cs_name <- cs_names[i]
+    indices <- con_data$susie_result_trimmed$sets$cs[[cs_name]]
+
+    # Get variants for this CS using the full variant_names list
+    cs_variants <- con_data$variant_names[indices]
+    cs_data <- top_loci_table[top_loci_table$variant_id %in% cs_variants, ]
+    if (nrow(cs_data) == 0) {
+      # None of this CS's variants appear in top_loci; skip it rather than
+      # let which.max() on an empty vector produce a zero-length row.
+      # bind_rows() below silently drops NULL elements.
+      return(NULL)
+    }
+    top_row <- which.max(cs_data$pip)
+
+    top_variant <- cs_data$variant_id[top_row]
+    # Find the global index of the top variant
+    top_variant_global_index <- which(con_data$variant_names == top_variant)
+    top_pip <- cs_data$pip[top_row]
+    top_z <- cs_data$z[top_row]
+    p_value <- z_to_pvalue(top_z)
+
+    # Pairwise correlations with the other CSs; NA when this is the only CS
+    cs_corr <- if (length(cs_names) > 1) {
+      con_data$susie_result_trimmed$cs_corr[i, ]
+    } else {
+      NA
+    }
+
+    # Return results for this CS as a one-row tibble
+    result <- tibble::tibble(
+      cs_name = cs_name,
+      variants_per_cs = length(cs_variants),
+      top_variant = top_variant,
+      top_variant_index = top_variant_global_index,
+      top_pip = top_pip,
+      top_z = top_z,
+      p_value = p_value,
+      cs_corr = list(paste(cs_corr, collapse = ",")) # list column if cs_corr is a vector
+    )
+    return(result)
+  })
+  # Combine all tibbles into one data frame
+  final_result <- dplyr::bind_rows(results)
+  return(final_result)
+}
+
+#' Extract Information for Top Variant from Finemapping Results
+#'
+#' This function extracts information about the variant with the highest Posterior
+#' Inclusion Probability (PIP) from finemapping results, typically used when no
+#' Credible Sets (CS) are identified in the analysis.
+#'
+#' @param con_data List. The method layer data from a finemapping RDS file.
+#' +#' @return A data frame with one row containing the following columns: +#' \item{cs_name}{NA (as no CS is identified)} +#' \item{variants_per_cs}{NA (as no CS is identified)} +#' \item{top_variant}{ID of the variant with the highest PIP} +#' \item{top_variant_index}{Index of the top variant in the original data} +#' \item{top_pip}{Highest Posterior Inclusion Probability (PIP)} +#' \item{top_z}{Z-score of the top variant} +#' \item{p_value}{P-value calculated from the top Z-score} +#' \item{cs_corr}{NA (as no CS correlation is available)} +#' +#' @details +#' This function is designed to be used when no Credible Sets are identified in +#' the finemapping results, but information about the most significant variant +#' is still desired. It identifies the variant with the highest PIP and extracts +#' relevant statistical information. +#' +#' @note +#' This function is particularly useful for capturing information about potentially +#' important variants that might be included in Credible Sets under different +#' analysis parameters or lower coverage. It maintains a structure similar to +#' the output of `extract_cs_info()` for consistency in downstream analyses. +#' +#' @seealso +#' \code{\link{extract_cs_info}} for processing when Credible Sets are present. 
+#'
+#' @export
+extract_top_pip_info <- function(con_data) {
+  # Find the variant with the highest PIP
+  top_pip_index <- which.max(con_data$susie_result_trimmed$pip)
+  top_pip <- con_data$susie_result_trimmed$pip[top_pip_index]
+  top_variant <- con_data$variant_names[top_pip_index]
+  top_z <- con_data$sumstats$z[top_pip_index]
+  p_value <- z_to_pvalue(top_z)
+
+  # Return a one-row tibble so the output matches the documented contract
+  # ("a data frame with one row") and the structure produced by
+  # extract_cs_info(); previously a bare list was returned.
+  tibble::tibble(
+    cs_name = NA,
+    variants_per_cs = NA,
+    top_variant = top_variant,
+    top_variant_index = top_pip_index,
+    top_pip = top_pip,
+    top_z = top_z,
+    p_value = p_value,
+    cs_corr = NA
+  )
+}
+
+#' Parse Credible Set Correlations from extract_cs_info() Output
+#'
+#' This function takes the output from `extract_cs_info()` and expands the `cs_corr` column
+#' into multiple columns, preserving the original order of correlations. It also
+#' calculates maximum and minimum correlation values for each Credible Set.
+#'
+#' @param df Data frame or data.table. The output from `extract_cs_info()` function,
+#' containing a `cs_corr` column with correlation information.
+#'
+#' @return A data.table with the original columns from the input, plus:
+#' \item{cs_corr_1, cs_corr_2, ...}{Individual correlation values, with column names
+#' based on their position in the original string}
+#' \item{cs_corr_max}{Maximum absolute correlation value (excluding 1)}
+#' \item{cs_corr_min}{Minimum absolute correlation value}
+#'
+#' @details
+#' The function splits the `cs_corr` column, which typically contains correlation
+#' values separated by ',', into individual columns. It preserves the order of
+#' these correlations, allowing for easy interpretation in a matrix-like format.
+#'
+#' @note
+#' - This function converts the input to a data.table if it isn't already one.
+#' - It handles cases where correlation values might be missing or not in the expected format.
+#' - The function assumes that correlation values of 1 represent self-correlations and excludes
+#' these when calculating max and min correlations.
+#'
+#' @importFrom data.table setDT
+#'
+#' @export
+parse_cs_corr <- function(df) {
+  # Convert to data.table if not already. inherits() is used instead of
+  # is.data.table() because only setDT is imported in the NAMESPACE.
+  if (!inherits(df, "data.table")) {
+    setDT(df)
+  }
+
+  extract_correlations <- function(x) {
+    # Early return if x is invalid. is.null() must be tested FIRST:
+    # is.na(NULL) returns logical(0), which makes if() throw an error,
+    # so the original ordering crashed on NULL elements.
+    if (is.null(x) || length(x) != 1 || is.na(x) || x == "" || !grepl(",", as.character(x))) {
+      return(list(values = numeric(0), max_corr = NA_real_, min_corr = NA_real_))
+    }
+
+    # Convert values; drop self-correlations of exactly 1 before taking
+    # absolute values for the max/min summaries
+    values <- as.numeric(unlist(strsplit(x, ",")))
+    values_filtered <- abs(values[values != 1])
+
+    # Return list with NA if no valid correlations
+    list(
+      values = values,
+      max_corr = if (length(values_filtered) > 0) max(values_filtered, na.rm = TRUE) else NA_real_,
+      min_corr = if (length(values_filtered) > 0) min(values_filtered, na.rm = TRUE) else NA_real_
+    )
+  }
+  # Process correlations
+  processed_results <- lapply(df$cs_corr, extract_correlations)
+  # If no valid results, add NA columns and return
+  if (all(vapply(processed_results, function(x) length(x$values) == 0, logical(1)))) {
+    df[, c("cs_corr_max", "cs_corr_min") := list(NA_real_, NA_real_)]
+    return(df)
+  }
+
+  # Determine max number of correlations (>= 1 here, guaranteed by the
+  # early return above, so seq_len() is safe)
+  max_corr_count <- max(vapply(processed_results, function(x) length(x$values), integer(1)))
+
+  # Create and add correlation columns
+  col_names <- paste0("cs_corr_", seq_len(max_corr_count))
+
+  for (i in seq_along(col_names)) {
+    df[, (col_names[i]) := sapply(processed_results, function(x) {
+      if (length(x$values) >= i) x$values[i] else NA_real_
+    })]
+  }
+
+  # Add max and min correlation columns
+  df[, `:=`(
+    cs_corr_max = sapply(processed_results, `[[`, "max_corr"),
+    cs_corr_min = sapply(processed_results, `[[`, "min_corr")
+  )]
+
+  return(df)
+}
+
+#' Process Credible Set Information and Determine Updating Strategy
+#'
+#' This function categorizes Credible Sets (CS) within a study block into different
+#' updating strategies based on their statistical properties and correlations.
+#'
+#' @param df Data frame.
Contains information about Credible Sets for a specific study and block. +#' @param high_corr_cols Character vector. Names of columns in df that represent high correlations. +#' +#' @return A modified data frame with additional columns attached to the diagnostic table: +#' \item{top_cs}{Logical. TRUE for the CS with the highest absolute Z-score.} +#' \item{tagged_cs}{Logical. TRUE for CS that are considered "tagged" based on p-value and correlation criteria.} +#' \item{method}{Character. The determined updating strategy ("BVSR", "SER", or "BCR").} +#' +#' @details +#' This function performs the following steps: +#' 1. Identifies the top CS based on the highest absolute Z-score. +#' 2. Identifies tagged CS based on high p-value and high correlations. +#' 3. Counts total, tagged, and remaining CS. +#' 4. Determines the appropriate updating method based on these counts. +#' +#' The updating methods are: +#' - BVSR (Bayesian Variable Selection Regression): Used when there's only one CS or all CS are accounted for. +#' - SER (Single Effect Regression): Used when there are tagged CS but no remaining untagged CS. +#' - BCR (Bayesian Conditional Regression): Used when there are remaining untagged CS. +#' +#' @note +#' This function is part of a developing methodology for automatically handling +#' finemapping results. The thresholds and criteria used (e.g., p-value > 1e-4 for tagging) +#' are subject to refinement and may change in future versions. 
+#'
+#' @importFrom dplyr case_when
+#'
+#' @export
+auto_decision <- function(df, high_corr_cols) {
+  # Identify top_cs: the CS with the largest absolute Z-score
+  top_cs_index <- which.max(abs(df$top_z))
+  df$top_cs <- FALSE
+  df$top_cs[top_cs_index] <- TRUE
+
+  # Identify tagged_cs: a non-top CS is tagged when its p-value is weak
+  # (> 1e-4) or it is perfectly correlated with another CS.
+  # seq_len() is safe for zero-row input, unlike 1:nrow(df); vapply
+  # guarantees a logical vector.
+  df$tagged_cs <- vapply(seq_len(nrow(df)), function(i) {
+    if (df$top_cs[i]) return(FALSE)
+    if (df$p_value[i] > 1e-4) return(TRUE)
+    if (length(high_corr_cols) == 0) return(FALSE)
+    any(sapply(high_corr_cols, function(col) df[i, ..col] == 1))
+  }, logical(1))
+
+  # Count total and remaining CS (leftover debug print() calls removed)
+  total_cs <- nrow(df)
+  tagged_cs_count <- sum(df$tagged_cs)
+  if (total_cs > 0) {
+    remaining_cs <- total_cs - 1 - tagged_cs_count
+  } else {
+    remaining_cs <- 0
+  }
+  # Determine method
+  df$method <- case_when(
+    tagged_cs_count == 0 & total_cs > 1 ~ "BVSR",
+    (remaining_cs == 0 & total_cs > 1) | (total_cs == 1) ~ "SER",
+    remaining_cs > 0 ~ "BCR",
+    TRUE ~ NA_character_
+  )
+
+  return(df)
+}
\ No newline at end of file
diff --git a/man/auto_decision.Rd b/man/auto_decision.Rd
new file mode 100644
index 00000000..d9dbc260
--- /dev/null
+++ b/man/auto_decision.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/univariate_rss_diagnostics.R
+\name{auto_decision}
+\alias{auto_decision}
+\title{Process Credible Set Information and Determine Updating Strategy}
+\usage{
+auto_decision(df, high_corr_cols)
+}
+\arguments{
+\item{df}{Data frame. Contains information about Credible Sets for a specific study and block.}
+
+\item{high_corr_cols}{Character vector. Names of columns in df that represent high correlations.}
+}
+\value{
+A modified data frame with additional columns attached to the diagnostic table:
+  \item{top_cs}{Logical. TRUE for the CS with the highest absolute Z-score.}
+  \item{tagged_cs}{Logical. TRUE for CS that are considered "tagged" based on p-value and correlation criteria.}
+  \item{method}{Character.
The determined updating strategy ("BVSR", "SER", or "BCR").} +} +\description{ +This function categorizes Credible Sets (CS) within a study block into different +updating strategies based on their statistical properties and correlations. +} +\details{ +This function performs the following steps: +1. Identifies the top CS based on the highest absolute Z-score. +2. Identifies tagged CS based on high p-value and high correlations. +3. Counts total, tagged, and remaining CS. +4. Determines the appropriate updating method based on these counts. + +The updating methods are: +- BVSR (Bayesian Variable Selection Regression): Used when there's only one CS or all CS are accounted for. +- SER (Single Effect Regression): Used when there are tagged CS but no remaining untagged CS. +- BCR (Bayesian Conditional Regression): Used when there are remaining untagged CS. +} +\note{ +This function is part of a developing methodology for automatically handling +finemapping results. The thresholds and criteria used (e.g., p-value > 1e-4 for tagging) +are subject to refinement and may change in future versions. +} diff --git a/man/extract_cs_info.Rd b/man/extract_cs_info.Rd new file mode 100644 index 00000000..fda919b6 --- /dev/null +++ b/man/extract_cs_info.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/univariate_rss_diagnostics.R +\name{extract_cs_info} +\alias{extract_cs_info} +\title{Process Credible Sets (CS) from Finemapping Results} +\usage{ +extract_cs_info(con_data, cs_names, top_loci_table) +} +\arguments{ +\item{con_data}{List. The method layer data from a finemapping RDS file that is not empty.} + +\item{cs_names}{Character vector. Names of the Credible Sets, usually in the format "L_".} + +\item{top_loci_table}{Data frame. 
The $top_loci layer data from the finemapping results.} +} +\value{ +A data frame with one row per CS, containing the following columns: + \item{cs_name}{Name of the Credible Set} + \item{variants_per_cs}{Number of variants in the CS} + \item{top_variant}{ID of the variant with the highest PIP in the CS} + \item{top_variant_index}{Global index of the top variant} + \item{top_pip}{Highest Posterior Inclusion Probability (PIP) in the CS} + \item{top_z}{Z-score of the top variant} + \item{p_value}{P-value calculated from the top Z-score} + \item{cs_corr}{Pairwise correlations of other CSs in this RDS with the CS of + the current row, delimited by '|', if there is more than one CS in this RDS file} +} +\description{ +This function extracts and processes information for each Credible Set (CS) +from finemapping results, typically obtained from a finemapping RDS file. +} +\details{ +This function is designed to be used only when there is at least one Credible Set +in the finemapping results usually for a given study and block. It processes each CS, +extracting key information such as the top variant, its statistics, and +correlation information between multiple CS if available. +} diff --git a/man/extract_top_pip_info.Rd b/man/extract_top_pip_info.Rd new file mode 100644 index 00000000..ce2ee41d --- /dev/null +++ b/man/extract_top_pip_info.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/univariate_rss_diagnostics.R +\name{extract_top_pip_info} +\alias{extract_top_pip_info} +\title{Extract Information for Top Variant from Finemapping Results} +\usage{ +extract_top_pip_info(con_data) +} +\arguments{ +\item{con_data}{List. 
The method layer data from a finemapping RDS file.} +} +\value{ +A data frame with one row containing the following columns: + \item{cs_name}{NA (as no CS is identified)} + \item{variants_per_cs}{NA (as no CS is identified)} + \item{top_variant}{ID of the variant with the highest PIP} + \item{top_variant_index}{Index of the top variant in the original data} + \item{top_pip}{Highest Posterior Inclusion Probability (PIP)} + \item{top_z}{Z-score of the top variant} + \item{p_value}{P-value calculated from the top Z-score} + \item{cs_corr}{NA (as no CS correlation is available)} +} +\description{ +This function extracts information about the variant with the highest Posterior +Inclusion Probability (PIP) from finemapping results, typically used when no +Credible Sets (CS) are identified in the analysis. +} +\details{ +This function is designed to be used when no Credible Sets are identified in +the finemapping results, but information about the most significant variant +is still desired. It identifies the variant with the highest PIP and extracts +relevant statistical information. +} +\note{ +This function is particularly useful for capturing information about potentially +important variants that might be included in Credible Sets under different +analysis parameters or lower coverage. It maintains a structure similar to +the output of `extract_cs_info()` for consistency in downstream analyses. +} +\seealso{ +\code{\link{extract_cs_info}} for processing when Credible Sets are present. +} diff --git a/man/get_susie_result.Rd b/man/get_susie_result.Rd new file mode 100644 index 00000000..e1895924 --- /dev/null +++ b/man/get_susie_result.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/univariate_rss_diagnostics.R +\name{get_susie_result} +\alias{get_susie_result} +\title{Extract SuSiE Results from Finemapping Data} +\usage{ +get_susie_result(con_data) +} +\arguments{ +\item{con_data}{List. 
The method layer data from a finemapping RDS file.} +} +\value{ +The trimmed SuSiE results (`$susie_result_trimmed`) if available, +otherwise NULL. +} +\description{ +This function extracts the trimmed SuSiE results from a finemapping data object, +typically obtained from a finemapping RDS file. It's designed to work with +the method layer of these files, often named as 'method_RAISS_imputed', 'method', +or 'method_NO_QC'. This layer is right under the study layer. +} +\details{ +The function checks if the input data is empty or if the `$susie_result_trimmed` +element is missing. It returns NULL in these cases. If `$susie_result_trimmed` +exists and is not empty, it returns this element. +} +\note{ +This function is particularly useful when working with large datasets +where not all method layers may contain valid SuSiE results or method layer. +} diff --git a/man/load_regional_functional_data.Rd b/man/load_regional_functional_data.Rd new file mode 100644 index 00000000..8c66aa6f --- /dev/null +++ b/man/load_regional_functional_data.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{load_regional_functional_data} +\alias{load_regional_functional_data} +\title{Load Regional Functional Association Data} +\usage{ +load_regional_functional_data(...) +} +\value{ +A list +} +\description{ +This function loads precomputed regional functional association data. +} diff --git a/man/load_regional_multivariate_data.Rd b/man/load_regional_multivariate_data.Rd new file mode 100644 index 00000000..00880981 --- /dev/null +++ b/man/load_regional_multivariate_data.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{load_regional_multivariate_data} +\alias{load_regional_multivariate_data} +\title{Load and Preprocess Regional Multivariate Data} +\usage{ +load_regional_multivariate_data(matrix_y_min_complete = NULL, ...) 
+} +\value{ +A list +} +\description{ +This function loads regional association data and processes it into a multivariate format. +It optionally filters out samples based on missingness thresholds in the response matrix. +} diff --git a/man/load_regional_regression_data.Rd b/man/load_regional_regression_data.Rd new file mode 100644 index 00000000..f069f59f --- /dev/null +++ b/man/load_regional_regression_data.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{load_regional_regression_data} +\alias{load_regional_regression_data} +\title{Load Regional Data for Regression Modeling} +\usage{ +load_regional_regression_data(...) +} +\value{ +A list +} +\description{ +This function loads regional association data formatted for regression modeling. +It includes phenotype, genotype, and covariate matrices along with metadata. +} diff --git a/man/load_regional_univariate_data.Rd b/man/load_regional_univariate_data.Rd new file mode 100644 index 00000000..47651c05 --- /dev/null +++ b/man/load_regional_univariate_data.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/file_utils.R +\name{load_regional_univariate_data} +\alias{load_regional_univariate_data} +\title{Load Regional Univariate Association Data} +\usage{ +load_regional_univariate_data(...) +} +\value{ +A list +} +\description{ +This function loads regional association data for univariate analysis. +It includes residual matrices, original genotype data, and additional metadata. +} diff --git a/man/mrash_weights.Rd b/man/mrash_weights.Rd new file mode 100644 index 00000000..9d567905 --- /dev/null +++ b/man/mrash_weights.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regularized_regression.R +\name{mrash_weights} +\alias{mrash_weights} +\title{Compute Weights Using mr.ash Shrinkage} +\usage{ +mrash_weights(X, y, init_prior_sd = TRUE, ...) 
+}
+\description{
+This function fits the `mr.ash` model (adaptive shrinkage regression) to estimate weights
+for a given set of predictors and response. It uses optional prior standard deviation initialization
+and can accept custom initial beta values.
+}
+\examples{
+wgt.mr.ash <- mrash_weights(eqtl$X, eqtl$y_res, beta.init = lasso_weights(eqtl$X, eqtl$y_res))
+}
diff --git a/man/parse_cs_corr.Rd b/man/parse_cs_corr.Rd
new file mode 100644
index 00000000..93fea320
--- /dev/null
+++ b/man/parse_cs_corr.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/univariate_rss_diagnostics.R
+\name{parse_cs_corr}
+\alias{parse_cs_corr}
+\title{Parse Credible Set Correlations from extract_cs_info() Output}
+\usage{
+parse_cs_corr(df)
+}
+\arguments{
+\item{df}{Data frame or data.table. The output from `extract_cs_info()` function,
+containing a `cs_corr` column with correlation information.}
+}
+\value{
+A data.table with the original columns from the input, plus:
+  \item{cs_corr_1, cs_corr_2, ...}{Individual correlation values, with column names
+    based on their position in the original string}
+  \item{cs_corr_max}{Maximum absolute correlation value (excluding 1)}
+  \item{cs_corr_min}{Minimum absolute correlation value}
+}
+\description{
+This function takes the output from `extract_cs_info()` and expands the `cs_corr` column
+into multiple columns, preserving the original order of correlations. It also
+calculates maximum and minimum correlation values for each Credible Set.
+}
+\details{
+The function splits the `cs_corr` column, which typically contains correlation
+values separated by '|', into individual columns. It preserves the order of
+these correlations, allowing for easy interpretation in a matrix-like format.
+}
+\note{
+- This function converts the input to a data.table if it isn't already one.
+- It handles cases where correlation values might be missing or not in the expected format.
+- The function assumes that correlation values of 1 represent self-correlations and excludes + these when calculating max and min correlations. +} diff --git a/man/rss_analysis_pipeline.Rd b/man/rss_analysis_pipeline.Rd index 2f3b8c4d..4883920a 100644 --- a/man/rss_analysis_pipeline.Rd +++ b/man/rss_analysis_pipeline.Rd @@ -23,7 +23,8 @@ rss_analysis_pipeline( impute_opts = list(rcond = 0.01, R2_threshold = 0.6, minimum_ld = 5, lamb = 0.01), pip_cutoff_to_skip = 0, remove_indels = FALSE, - comment_string = "#" + comment_string = "#", + diagnostics = FALSE ) } \arguments{ diff --git a/man/run_linear_regression.Rd b/man/run_linear_regression.Rd deleted file mode 100644 index 6a37e133..00000000 --- a/man/run_linear_regression.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quail_vqtl.R -\name{run_linear_regression} -\alias{run_linear_regression} -\title{Perform Linear Regression for GWAS} -\usage{ -run_linear_regression(genotype, phenotype, covariates = NULL) -} -\arguments{ -\item{genotype}{Numeric matrix of genotypes (n x p), where n is the number of samples and p is the number of SNPs.} - -\item{phenotype}{Numeric vector of phenotypes (length n).} - -\item{covariates}{Optional numeric matrix of covariates (n x k), where k is the number of covariates.} -} -\value{ -A data frame containing regression results for each SNP, including \code{BETA}, \code{SE}, \code{Z}, \code{P}, and \code{Q}. 
-} -\description{ -Perform Linear Regression for GWAS -} diff --git a/man/univariate_regression.Rd b/man/univariate_regression.Rd deleted file mode 100644 index bc2897be..00000000 --- a/man/univariate_regression.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/quail_vqtl.R -\name{univariate_regression} -\alias{univariate_regression} -\title{Perform Univariate Regression for Genotype Data} -\usage{ -univariate_regression( - X, - y, - Z = NULL, - center = TRUE, - scale = FALSE, - return_residuals = FALSE -) -} -\arguments{ -\item{X}{Numeric matrix of genotypes (n x p), where n is the number of samples and p is the number of SNPs.} - -\item{y}{Numeric vector of phenotypes (length n).} - -\item{Z}{Optional numeric matrix of covariates (n x k), where k is the number of covariates.} - -\item{center}{Logical, whether to center the data (default: TRUE).} - -\item{scale}{Logical, whether to scale the data (default: FALSE).} - -\item{return_residuals}{Logical, whether to return residuals (default: FALSE).} -} -\value{ -A list containing regression results: \code{betahat}, \code{sebetahat}, \code{z_scores}, \code{p_values}, and \code{q_values}. -} -\description{ -Perform Univariate Regression for Genotype Data -} diff --git a/man/z_to_pvalue.Rd b/man/z_to_pvalue.Rd new file mode 100644 index 00000000..fd47ae03 --- /dev/null +++ b/man/z_to_pvalue.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/misc.R +\name{z_to_pvalue} +\alias{z_to_pvalue} +\title{Convert Z-scores to P-values} +\usage{ +z_to_pvalue(z) +} +\arguments{ +\item{z}{Numeric vector. The z-scores to be converted to p-values.} +} +\value{ +A numeric vector of p-values corresponding to the input z-scores. +} +\description{ +This function calculates p-values from given z-scores using a two-tailed normal distribution. +It supports vector input to process multiple z-scores simultaneously. 
+} +\details{ +The function uses the following formula to calculate p-values: +p-value = 2 * Φ(-|z|) +Where Φ is the cumulative distribution function of the standard normal distribution. +} +\note{ +This function assumes that the input z-scores are from a two-tailed test and +are normally distributed. It calculates two-sided p-values. +For extremely large absolute z-scores, the resulting p-values may be computed as zero +due to floating-point limitations in R. This occurs when the absolute z-score > 37. +} +\examples{ +z <- c(2.5, -1.8, 3.2, 0.7) +pvalues <- z_to_pvalue(z) +print(pvalues) + +}