diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml index 83bdfe28..e279aac0 100644 --- a/.github/recipe/recipe.yaml +++ b/.github/recipe/recipe.yaml @@ -23,6 +23,7 @@ requirements: - ${{ stdlib('c') }} - ${{ compiler('cxx') }} host: + - bioconductor-biostrings - bioconductor-iranges - bioconductor-qvalue - bioconductor-s4vectors @@ -69,6 +70,7 @@ requirements: - r-vroom - r-xicor run: + - bioconductor-biostrings - bioconductor-iranges - bioconductor-qvalue - bioconductor-s4vectors diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ba638a62..7e6ff12d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,7 +36,7 @@ jobs: - name: Create TOML from recipe run: | .github/workflows/create_toml_from_yaml.sh ${GITHUB_WORKSPACE} - mkdir /tmp/pixi + mkdir -p /tmp/pixi mv ${GITHUB_WORKSPACE}/pixi.toml /tmp/pixi - name: Setup pixi @@ -45,7 +45,9 @@ jobs: manifest-path: /tmp/pixi/pixi.toml - name: Run unit tests - run: pixi run --environment ${{ matrix.environment }} --manifest-path /tmp/pixi/pixi.toml devtools_test + run: | + pixi config set --manifest-path /tmp/pixi/pixi.toml --local run-post-link-scripts insecure + pixi run --environment ${{ matrix.environment }} --manifest-path /tmp/pixi/pixi.toml devtools_test - name: Check unit test code coverage if: matrix.platform == 'linux-64' && matrix.environment == 'r44' diff --git a/DESCRIPTION b/DESCRIPTION index 0abe849e..e30cd3b1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Authors@R: c(person("Gao Wang",role = c("cre","aut"), person("StatFunGen Lab", role = "ctb")) License: MIT + file LICENSE Imports: + Biostrings, IRanges, Rcpp, S4Vectors, diff --git a/NAMESPACE b/NAMESPACE index 1fd2116c..3f7f4994 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,9 +23,7 @@ export(coloc_wrapper) export(colocboost_analysis_pipeline) export(compute_LD) export(compute_qtl_enrichment) -export(ctwas_bimfile_loader) export(dentist) -export(dentist_from_files) export(dentist_single_window) export(dpr_weights) export(enet_weights) @@ -39,7 +37,6 @@ export(find_data) export(find_duplicate_variants) export(fsusie_get_cs) export(fsusie_wrapper) -export(get_ctwas_meta_data) export(get_filter_lbf_index) export(get_nested_element) export(get_ref_variant_info) @@ -54,6 +51,7 @@ export(lassosum_rss) export(lassosum_rss_weights) export(lbf_to_alpha) export(ld_loader) +export(ld_mismatch_qc) export(load_LD_matrix) export(load_genotype_region) export(load_multicontext_sumstats) @@ -89,7 +87,6 @@ export(normalize_variant_id) export(otters_association) export(otters_weights) export(parse_cs_corr) -export(parse_dentist_output) export(parse_region) export(parse_variant_id) export(perform_qr_analysis) @@ -99,7 +96,6 @@ export(qr_screen) export(quantile_twas_weight_pipeline) export(raiss) export(read_afreq) -export(read_dentist_sumstat) export(region_to_df) export(rss_analysis_pipeline) export(rss_basic_qc) diff --git a/R/RcppExports.R b/R/RcppExports.R index e434983b..5a0bbd5e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,24 +1,6 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -#' Compute LD matrix using GCTA-style formula matching the DENTIST binary. -#' -#' This function computes pairwise Pearson correlations from a genotype matrix -#' using the exact same formula and floating-point operation order as the -#' original DENTIST C++ binary (calcLDFromBfile_gcta in bfileOperations.cpp). -#' Key differences from R's crossprod-based approach: -#' 1. Accumulates integer-valued sums sequentially (exact intermediate values) -#' 2. Handles per-pair missing data with GCTA correction formula -#' 3. Divides by nKeptSample (trimmed to multiple of 4), not nrow(X) -#' -#' @param X Numeric genotype matrix (samples x SNPs), values in {0, 1, 2, NA}. -#' Rows should already be trimmed to a multiple of 4. -#' @param ncpus Number of CPU threads for parallel computation. -#' @return Symmetric LD correlation matrix with 1.0 on diagonal. -compute_LD_gcta_cpp <- function(X, ncpus = 1L) { - .Call('_pecotmr_compute_LD_gcta_cpp', PACKAGE = 'pecotmr', X, ncpus) -} - dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose = FALSE) { .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose) } @@ -35,7 +17,7 @@ qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas = 0, pi_qtl .Call('_pecotmr_qtl_enrichment_rcpp', PACKAGE = 'pecotmr', r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads) } -sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE) { - .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose) +sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE, seed = NULL) { + .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed) } diff --git a/R/allele_qc.R b/R/allele_qc.R index 0c4f787e..4d5f1c69 100644 --- a/R/allele_qc.R +++ b/R/allele_qc.R @@ -27,27 +27,9 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, remove_indels = FALSE, remove_strand_ambiguous = TRUE, flip_strand = FALSE, remove_unmatched = TRUE, ...) { strand_flip <- function(ref) { - # Define a mapping for complementary bases - base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G") - - # Function to complement a single base - complement_base <- function(base) { - complement <- base_mapping[base] - return(complement) - } - - # Function to reverse and complement a DNA sequence - reverse_complement <- function(sequence) { - reversed <- rev(strsplit(sequence, NULL)[[1]]) - complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE) - return(paste(complemented, collapse = "")) + as.character(Biostrings::reverseComplement(Biostrings::DNAStringSet(ref))) } - complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE) - - return(complemented_sequence) - } - # helper to sanitize column names to avoid NA/empty names that break dplyr verbs sanitize_names <- function(df) { nm <- colnames(df) @@ -67,15 +49,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, # check if the pattern is ATCG/DI check_ATCG <- function(vec) { - pattern <- "^[ATCGDI]+$" - - # Function to check if a single element matches the pattern - check_element <- function(element) { - grepl(pattern, element) - } - - result <- sapply(vec, check_element, USE.NAMES = FALSE) - return(result) + grepl("^[ATCGDI]+$", vec) } # transform all inputs to dataframe @@ -100,7 +74,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, if (any(columns_to_remove %in% colnames(target_data))) { target_data <- select(target_data, -any_of(columns_to_remove)) } - match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>% as.data.frame() + match_result <- inner_join(target_data, ref_variants, by = c("chrom", "pos"), suffix = c(".target", ".ref")) %>% as.data.frame() # sanitize names after merge as well (merge can introduce empty names in edge cases) match_result <- sanitize_names(match_result) @@ -134,7 +108,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL, } # if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip # not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones - if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) { + if (!any(match_result$strand_flip & match_result$strand_unambiguous)) { match_result <- match_result %>% mutate(strand_unambiguous = TRUE) } diff --git a/R/ctwas_wrapper.R b/R/ctwas_wrapper.R index 90425819..76d5c7c3 100644 --- a/R/ctwas_wrapper.R +++ b/R/ctwas_wrapper.R @@ -1,35 +1,5 @@ -#' @importFrom vroom vroom -#' @export -ctwas_bimfile_loader <- function(bim_file_path) { - snp_info <- as.data.frame(vroom(bim_file_path, col_names = FALSE)) - if (ncol(snp_info) == 9) { - colnames(snp_info) <- c("chrom", "id", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") - } else { - colnames(snp_info) <- c("chrom", "id", "GD", "pos", "A1", "A2") - } - snp_info$id <- normalize_variant_id(snp_info$id) - return(snp_info) -} - -#' Utility function to format meta data dataframe for cTWAS analyses -#' @importFrom vroom vroom -#' @export -get_ctwas_meta_data <- function(ld_meta_data_file, subset_region_ids = NULL) { - LD_info <- as.data.frame(vroom(ld_meta_data_file)) - colnames(LD_info)[1] <- "chrom" - LD_info$region_id <- paste(as.integer(strip_chr_prefix(LD_info$chrom)), LD_info$start, LD_info$end, sep = "_") - LD_info$LD_file <- paste0(dirname(ld_meta_data_file), "/", gsub(",.*$", "", LD_info$path)) - LD_info$SNP_file <- paste0(LD_info$LD_file, ".bim") - LD_info <- LD_info[, c("region_id", "LD_file", "SNP_file")] - region_info <- LD_info[, "region_id", drop = FALSE] - region_info$chrom <- as.integer(gsub("\\_.*$", "", region_info$region_id)) - region_info$start <- as.integer(gsub("\\_.*$", "", sub("^.*?\\_", "", region_info$region_id))) - region_info$stop <- as.integer(sub("^.*?\\_", "", sub("^.*?\\_", "", region_info$region_id))) - region_info$region_id <- paste0(region_info$chrom, "_", region_info$start, "_", region_info$stop) - region_info <- region_info[, c("chrom", "start", "stop", "region_id")] - if (!is.null(subset_region_ids)) region_info <- region_info[region_info$region_id %in% subset_region_ids, ] - return(list(LD_info = LD_info, region_info = region_info)) -} +### File-I/O functions (ctwas_bimfile_loader, get_ctwas_meta_data) have been +### removed. Use ld_loader() and read_bim() from the standard I/O path instead. #' Function to select variants for ctwas weights input #' @param region_data A list of list containing weights list and snp_info list data for multiple genes/events within a single LD block region. diff --git a/R/dentist_qc.R b/R/dentist_qc.R index 543c92f3..a47f2543 100644 --- a/R/dentist_qc.R +++ b/R/dentist_qc.R @@ -16,7 +16,8 @@ #' \code{nSample} (integer or NULL). #' #' @noRd -resolve_LD_input <- function(R = NULL, X = NULL, nSample = NULL, need_nSample = FALSE) { +resolve_LD_input <- function(R = NULL, X = NULL, nSample = NULL, need_nSample = FALSE, + ld_method = "sample") { if (is.null(R) && is.null(X)) { stop("Either R (LD matrix) or X (genotype matrix) must be provided.") } @@ -26,7 +27,7 @@ resolve_LD_input <- function(R = NULL, X = NULL, nSample = NULL, need_nSample = if (!is.null(X)) { if (!is.matrix(X)) X <- as.matrix(X) if (is.null(nSample)) nSample <- nrow(X) - R <- compute_LD(X, method = "sample") + R <- compute_LD(X, method = ld_method) } if (need_nSample && is.null(nSample)) { stop("nSample is required when providing an LD matrix R.") @@ -67,6 +68,10 @@ resolve_LD_input <- function(R = NULL, X = NULL, nSample = NULL, need_nSample = #' @param correct_chen_et_al_bug Logical indicating whether to correct the Chen et al. bug. Default is TRUE. #' @param min_dim In distance mode: minimum number of SNPs per block (default 2000). #' In count mode: the number of variants per window (i.e., the window size). +#' @param ld_method Character string specifying the LD computation method when +#' \code{X} is provided. Passed to \code{\link{compute_LD}}. One of +#' \code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +#' Ignored when \code{R} is provided directly. #' #' @return A data frame containing the imputed result and detected outliers. #' @@ -111,9 +116,11 @@ dentist <- function(sum_stat, R = NULL, X = NULL, nSample = NULL, window_size = 2000000, window_mode = c("distance", "count"), pValueThreshold = 5.0369e-8, propSVD = 0.4, gcControl = FALSE, nIter = 10, gPvalueThreshold = 0.05, duprThreshold = 0.99, ncpus = 1, - correct_chen_et_al_bug = TRUE, min_dim = 2000) { + correct_chen_et_al_bug = TRUE, min_dim = 2000, + ld_method = "sample") { # Resolve LD matrix and sample size from R or X - resolved <- resolve_LD_input(R = R, X = X, nSample = nSample, need_nSample = TRUE) + resolved <- resolve_LD_input(R = R, X = X, nSample = nSample, need_nSample = TRUE, + ld_method = ld_method) LD_mat <- resolved$R nSample <- resolved$nSample @@ -191,6 +198,10 @@ dentist <- function(sum_stat, R = NULL, X = NULL, nSample = NULL, #' @param duprThreshold Duplicate r-squared threshold. Default is 0.99. #' @param ncpus Number of CPU cores. Default is 1. #' @param correct_chen_et_al_bug Correct the original DENTIST operator! bug. Default is TRUE. +#' @param ld_method Character string specifying the LD computation method when +#' \code{X} is provided. Passed to \code{\link{compute_LD}}. One of +#' \code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +#' Ignored when \code{R} is provided directly. #' #' @return Data frame with columns: original_z, imputed_z, iter_to_correct, rsq, #' is_duplicate, outlier_stat, outlier. @@ -201,9 +212,11 @@ dentist <- function(sum_stat, R = NULL, X = NULL, nSample = NULL, dentist_single_window <- function(zScore, R = NULL, X = NULL, nSample = NULL, pValueThreshold = 5e-8, propSVD = 0.4, gcControl = FALSE, nIter = 10, gPvalueThreshold = 0.05, duprThreshold = 0.99, - ncpus = 1, correct_chen_et_al_bug = TRUE) { + ncpus = 1, correct_chen_et_al_bug = TRUE, + ld_method = "sample") { # Resolve LD matrix and sample size from R or X - LD_mat <- resolve_LD_input(R = R, X = X, nSample = nSample, need_nSample = TRUE) + LD_mat <- resolve_LD_input(R = R, X = X, nSample = nSample, need_nSample = TRUE, + ld_method = ld_method) nSample <- LD_mat$nSample LD_mat <- LD_mat$R @@ -740,355 +753,7 @@ merge_windows <- function(dentist_result_by_window, window_divided_res) { return(merged_results) } -#' Read DENTIST-format Summary Statistics -#' -#' Reads a GWAS summary statistics file in the format expected by the DENTIST C++ binary -#' and returns it as an R data frame with z-scores computed. -#' -#' @param gwas_summary Path to summary statistics file (tab/space-separated, -#' 8 columns: SNP A1 A2 freq beta se p N). May be gzipped. -#' @return A data frame with columns: SNP, A1, A2, freq, beta, se, p, N, z. -#' -#' @export -read_dentist_sumstat <- function(gwas_summary) { - if (!file.exists(gwas_summary)) { - stop(paste0("Summary statistics file not found: ", gwas_summary)) - } - # Handle gzipped files - if (grepl("\\.gz$", gwas_summary)) { - con <- gzfile(gwas_summary, "rt") - on.exit(close(con), add = TRUE) - ss <- read.table(con, header = TRUE, stringsAsFactors = FALSE) - } else { - ss <- read.table(gwas_summary, header = TRUE, stringsAsFactors = FALSE) - } - expected <- c("SNP", "A1", "A2", "freq", "beta", "se", "p", "N") - # Case-insensitive matching - matched <- sapply(expected, function(e) { - idx <- which(tolower(colnames(ss)) == tolower(e)) - if (length(idx) == 0) return(NA_integer_) - idx[1] - }) - if (any(is.na(matched))) { - missing <- expected[is.na(matched)] - stop(paste0("Missing columns in summary file: ", paste(missing, collapse = ", "), - ". Expected: ", paste(expected, collapse = ", "))) - } - ss <- ss[, matched, drop = FALSE] - colnames(ss) <- expected - ss$z <- ss$beta / ss$se - return(ss) -} - -#' Run R DENTIST Implementation on DENTIST-format Input Files -#' -#' Takes the same file-based inputs as the DENTIST C++ binary (summary stats file + -#' PLINK bfile) and runs the R \code{\link{dentist}} implementation. This allows -#' direct comparison between the R implementation and the C++ binary on identical data. -#' -#' This function reuses existing package utilities for file I/O, allele QC, and LD -#' computation: \code{\link{read_bim}} for PLINK bim files, \code{\link{allele_qc}} -#' for allele matching/flipping, \code{\link{load_genotype_region}} for genotype loading, -#' and \code{compute_LD} (from misc.R) for LD matrix computation with mean imputation -#' and Rfast::cora when available. -#' -#' @param gwas_summary Path to the GWAS summary statistics file (DENTIST format: -#' tab-separated, 8 columns with header: SNP A1 A2 freq beta se p N). May be gzipped. -#' @param bfile PLINK binary file prefix (expects .bed/.bim/.fam files). -#' @param nSample Number of samples in the LD reference panel. If NULL (recommended), -#' uses the reference panel size from the genotype matrix. This controls the SVD -#' truncation rank K = min(idx_size, nSample) * propSVD. Note: this is the reference -#' panel size, NOT the GWAS sample size. Default is NULL. -#' @param window_size Window size in base pairs for distance mode. Default is 2000000. -#' @param window_mode Character string specifying the windowing strategy: -#' \code{"distance"} (default) or \code{"count"}. See \code{\link{dentist}}. -#' @param pValueThreshold P-value threshold for outlier detection. Default is 5.0369e-8. -#' @param propSVD SVD truncation proportion. Default is 0.4. -#' @param gcControl Logical; apply genomic control. Default is FALSE. -#' @param nIter Number of QC iterations. Default is 10. -#' @param gPvalueThreshold GWAS p-value threshold for grouping. Default is 0.05. -#' @param duprThreshold LD r-squared threshold for duplicate detection. Default is 0.99. -#' @param ncpus Number of CPU threads. Default is 1. -#' @param correct_chen_et_al_bug Logical; correct known bugs in original DENTIST. Default is TRUE. -#' @param min_dim In distance mode: minimum number of SNPs per block (default 2000). -#' In count mode: the number of variants per window. -#' @param use_gcta_LD Logical; use GCTA-style LD computation and raw B-allele -#' counts to match the DENTIST binary's exact floating-point behavior. Requires -#' snpStats. Default is FALSE; set to TRUE when comparing against the binary. -#' @param verbose Logical; print progress messages. Default is TRUE. -#' -#' @return A list with components: -#' \describe{ -#' \item{result}{Data frame from \code{\link{dentist}} with outlier detection results.} -#' \item{sum_stat}{Aligned summary statistics data frame (with SNP names and positions).} -#' \item{LD_mat}{The LD correlation matrix used.} -#' } -#' -#' @details -#' This function performs the full pipeline: -#' \enumerate{ -#' \item Reads the summary statistics file via \code{\link{read_dentist_sumstat}}. -#' \item Reads the PLINK bim file via \code{read_bim} and matches SNPs by ID -#' to obtain chromosome and position information. -#' \item Aligns alleles using \code{\link{allele_qc}}, which handles strand flips, -#' allele swaps, and z-score sign flipping. -#' \item Loads genotype data via \code{\link{load_genotype_region}} and computes -#' the LD correlation matrix via \code{compute_LD} (mean imputation + Rfast::cora). -#' \item Calls \code{\link{dentist}} with the aligned data. -#' } -#' -#' The result includes the aligned summary statistics and LD matrix so they can be -#' reused for further analysis or debugging. -#' -#' @seealso \code{\link{dentist}}, \code{\link{read_dentist_sumstat}}, -#' \code{\link{allele_qc}}, \code{\link{parse_dentist_output}} -#' -#' @export -dentist_from_files <- function(gwas_summary, - bfile, - nSample = NULL, - window_size = 2000000, - window_mode = c("distance", "count"), - pValueThreshold = 5.0369e-8, - propSVD = 0.4, - gcControl = FALSE, - nIter = 10, - gPvalueThreshold = 0.05, - duprThreshold = 0.99, - ncpus = 1, - correct_chen_et_al_bug = TRUE, - min_dim = 2000, - use_gcta_LD = FALSE, - verbose = TRUE) { - # 1. Read summary stats - if (verbose) message("Reading summary statistics...") - ss <- read_dentist_sumstat(gwas_summary) - if (verbose) message(sprintf(" %d variants read", nrow(ss))) - - # 2. Read bim using existing utility (read_bim expects .bed path) - if (verbose) message("Reading reference panel bim...") - bim <- read_bim(paste0(bfile, ".bed")) - # bim columns: chrom, id, gpos, pos, a1, a0 - if (verbose) message(sprintf(" %d variants in reference panel", nrow(bim))) - - # 3. Match summary stats to bim by SNP ID to get chrom/pos - common_ids <- intersect(ss$SNP, bim$id) - if (length(common_ids) == 0) { - stop("No common SNPs between summary statistics and reference panel.") - } - if (verbose) message(sprintf(" %d SNPs in common", length(common_ids))) - - ss_match <- ss[match(common_ids, ss$SNP), , drop = FALSE] - bim_match <- bim[match(common_ids, bim$id), , drop = FALSE] - - # 4. Allele QC using existing allele_qc() utility - # Build target df: sumstats alleles + chrom/pos from bim + extra columns to carry through - target_df <- data.frame( - chrom = as.integer(bim_match$chrom), - pos = as.integer(bim_match$pos), - A1 = ss_match$A1, - A2 = ss_match$A2, - z = ss_match$z, - SNP = ss_match$SNP, - N = ss_match$N, - bim_id = bim_match$id, - stringsAsFactors = FALSE - ) - - # Build ref df: bim alleles (a1 = effect allele, a0 = other allele) - ref_df <- data.frame( - chrom = as.integer(bim_match$chrom), - pos = as.integer(bim_match$pos), - A1 = bim_match$a1, - A2 = bim_match$a0, - stringsAsFactors = FALSE - ) - - if (verbose) message("Aligning alleles via allele_qc()...") - qc_result <- allele_qc( - target_data = target_df, - ref_variants = ref_df, - col_to_flip = "z", - match_min_prop = 0, - remove_dups = TRUE, - remove_strand_ambiguous = TRUE - ) - aligned <- qc_result$target_data_qced - - if (nrow(aligned) == 0) { - stop("No variants remaining after allele QC.") - } - - # Sort by position - aligned <- aligned[order(aligned$pos), , drop = FALSE] - rownames(aligned) <- NULL - - if (verbose) { - message(sprintf(" %d variants after allele QC (from %d common)", - nrow(aligned), length(common_ids))) - } - - # 6. Load genotypes and compute LD - if (verbose) message("Loading genotype data...") - if (use_gcta_LD) { - # Load raw B-allele counts (as in the .bed file) to match the DENTIST - # binary's exact floating-point behavior via GCTA-style LD computation. - if (!requireNamespace("snpStats", quietly = TRUE)) { - stop("snpStats is required for use_gcta_LD = TRUE") - } - geno <- snpStats::read.plink(bfile) - X <- as(geno$genotypes, "numeric") # B-allele counts, matching DENTIST software - } else { - X <- load_genotype_region(bfile, region = NULL) - } - - # Determine sample size -- use reference panel size (nrow of genotype matrix), - # NOT the GWAS sample size from the summary stats N column. - # The original DENTIST software uses ref.N (number of samples in the .fam file) - # for the K = min(idx.size(), nSample) * propSVD truncation. - if (is.null(nSample)) { - nSample <- as.integer(nrow(X)) - if (verbose) message(sprintf("Using reference panel sample size: N = %d", nSample)) - } - - # Subset to aligned SNPs using bim IDs - snp_ids_for_geno <- aligned$bim_id - missing_snps <- snp_ids_for_geno[!snp_ids_for_geno %in% colnames(X)] - if (length(missing_snps) > 0) { - warning(sprintf("%d aligned SNPs not found in genotype matrix; dropping them.", - length(missing_snps))) - aligned <- aligned[snp_ids_for_geno %in% colnames(X), , drop = FALSE] - snp_ids_for_geno <- aligned$bim_id - } - X_sub <- X[, snp_ids_for_geno, drop = FALSE] - - # Remove zero-variance columns (monomorphic SNPs) to avoid NaN correlations - # Check variance BEFORE handling missing (using non-missing values) - col_vars <- apply(X_sub, 2, function(x) { - vals <- x[!is.na(x)] - length(unique(vals)) > 1 - }) - if (any(!col_vars)) { - n_mono <- sum(!col_vars) - if (verbose) message(sprintf(" Removing %d monomorphic SNPs", n_mono)) - X_sub <- X_sub[, col_vars, drop = FALSE] - aligned <- aligned[col_vars, , drop = FALSE] - snp_ids_for_geno <- aligned$bim_id - } - - n_missing <- sum(is.na(X_sub)) - if (verbose) message(sprintf("Computing LD for %d SNPs from %d samples (%d missing genotypes)...", - ncol(X_sub), nrow(X_sub), n_missing)) - - # Compute LD matrix. - if (use_gcta_LD) { - # Use C++ GCTA-style computation matching the DENTIST software bfileOperations.cpp - # exactly: per-pair missing data handling, sequential sample accumulation, - # same floating-point operation order. - N_kept <- (nrow(X_sub) %/% 4L) * 4L - if (N_kept < nrow(X_sub)) { - X_sub <- X_sub[seq_len(N_kept), , drop = FALSE] - } - LD_mat <- compute_LD_gcta_cpp(X_sub, ncpus = 1L) - colnames(LD_mat) <- rownames(LD_mat) <- colnames(X_sub) - } else { - LD_mat <- compute_LD(X_sub, method = "sample") - } - - # 7. Prepare sum_stat for dentist() -- needs pos and z columns - dentist_input <- data.frame( - SNP = aligned$SNP, - pos = aligned$pos, - z = aligned$z, - stringsAsFactors = FALSE - ) - - # 8. Run dentist - if (verbose) message("Running R DENTIST implementation...") - dentist_result <- dentist( - sum_stat = dentist_input, - R = LD_mat, - nSample = nSample, - window_size = window_size, - window_mode = window_mode, - pValueThreshold = pValueThreshold, - propSVD = propSVD, - gcControl = gcControl, - nIter = nIter, - gPvalueThreshold = gPvalueThreshold, - duprThreshold = duprThreshold, - ncpus = ncpus, - correct_chen_et_al_bug = correct_chen_et_al_bug, - min_dim = min_dim - ) - - # Attach SNP names to result using index_global when available (windowed mode), - # or directly when the result has the same number of rows as the input. - if ("index_global" %in% colnames(dentist_result)) { - dentist_result$SNP <- dentist_input$SNP[dentist_result$index_global] - } else if (nrow(dentist_result) == nrow(dentist_input)) { - dentist_result$SNP <- dentist_input$SNP - } else { - warning("Cannot assign SNP names: row count mismatch and no index_global column") - } - - if (verbose) { - n_outlier <- sum(dentist_result$outlier, na.rm = TRUE) - message(sprintf("Done: %d variants tested, %d outliers detected (%.1f%%)", - nrow(dentist_result), n_outlier, - 100 * n_outlier / nrow(dentist_result))) - } - - return(list( - result = dentist_result, - sum_stat = aligned, - LD_mat = LD_mat - )) -} - -#' Parse DENTIST Binary Output Files -#' -#' Reads the output files produced by the DENTIST C++ software and returns -#' a structured data frame. Useful for comparing DENTIST output against -#' \code{\link{dentist_from_files}} results. -#' -#' @param output_prefix The output prefix used when running the DENTIST software -#' (the \code{--out} argument). -#' @param pValueThreshold P-value threshold used to determine outlier status. -#' Default is 5e-8. -#' @return A data frame with columns: SNP, test_stat, neg_log10_p, is_duplicate, outlier. -#' -#' @details -#' The DENTIST software (non-debug mode) writes a \code{.DENTIST.full.txt} file with -#' 4 tab-separated columns (no header): rsID, stat/lambda, -log10(pvalue), isDuplicate. -#' It also writes a \code{.DENTIST.short.txt} file listing outlier SNP IDs (one per line). -#' -#' @export -parse_dentist_output <- function(output_prefix, pValueThreshold = 5e-8) { - full_file <- paste0(output_prefix, ".DENTIST.full.txt") - short_file <- paste0(output_prefix, ".DENTIST.short.txt") - - if (!file.exists(full_file)) { - stop(paste0("DENTIST full output file not found: ", full_file)) - } - - full_df <- read.table(full_file, header = FALSE, sep = "\t", - stringsAsFactors = FALSE, - col.names = c("SNP", "test_stat", "neg_log10_p", "is_duplicate")) - full_df$is_duplicate <- as.logical(full_df$is_duplicate) - full_df$outlier <- full_df$neg_log10_p > -log10(pValueThreshold) - - # Cross-check with short file if it exists - if (file.exists(short_file) && file.info(short_file)$size > 0) { - short_snps <- trimws(readLines(short_file)) - short_snps <- short_snps[nchar(short_snps) > 0] - n_outlier_full <- sum(full_df$outlier) - n_outlier_short <- length(short_snps) - if (n_outlier_full != n_outlier_short) { - warning(paste0("Outlier count mismatch: full file has ", n_outlier_full, - " outliers, short file has ", n_outlier_short, " outliers.")) - } - } - - return(full_df) -} +### File-I/O functions (dentist_from_files, read_dentist_sumstat, parse_dentist_output) +### have been removed. Use the standard pipeline: load genotypes via +### load_genotype_region(), compute LD via compute_LD(), then call dentist() +### or ld_mismatch_qc() directly. diff --git a/R/ld_loader.R b/R/ld_loader.R index 7b2c4640..21b56fac 100644 --- a/R/ld_loader.R +++ b/R/ld_loader.R @@ -16,7 +16,7 @@ #' Set \code{ld_meta_path} and \code{regions}.} #' \item{LD_info mode}{Loads pre-computed LD blocks from \code{.cor.xz} #' files listed in an \code{LD_info} data.frame (as returned by -#' \code{\link{get_ctwas_meta_data}}). Set \code{LD_info}.} +#' cTWAS meta-data utilities). Set \code{LD_info}.} #' } #' #' @param R_list List of G precomputed LD correlation matrices (p_g x p_g). @@ -30,7 +30,7 @@ #' \code{.cor.xz} LD matrix files) and optionally \code{SNP_file} #' (paths to companion \code{.bim} files; defaults to #' \code{paste0(LD_file, ".bim")} if absent). As returned by -#' \code{\link{get_ctwas_meta_data}}. +#' cTWAS meta-data utilities. #' @param return_genotype Logical. When using region mode, return the #' genotype matrix X (\code{TRUE}) or LD correlation R (\code{FALSE}, #' default). diff --git a/R/misc.R b/R/misc.R index b9fcdd4f..014de82c 100644 --- a/R/misc.R +++ b/R/misc.R @@ -188,7 +188,7 @@ safe_svd <- function(mat, tol = 1e-8, max_rank = NULL) { #' Compute LD (Linkage Disequilibrium) Correlation Matrix from Genotypes #' #' Computes a pairwise Pearson correlation matrix from a genotype matrix. -#' Supports two variance conventions: +#' Supports three variance conventions: #' \describe{ #' \item{\code{"sample"}}{Standard sample variance with N-1 denominator (default). #' Uses mean imputation for missing genotypes, then \code{Rfast::cora} (if available) @@ -198,16 +198,22 @@ safe_svd <- function(mat, tol = 1e-8, max_rank = NULL) { #' from non-missing values; missing entries are set to zero after centering so they #' do not contribute to cross-products. Cross-products are normalized by the total #' sample count N, not by pairwise non-missing counts.} +#' \item{\code{"gcta"}}{GCTA per-pair missing data correction. Like \code{"population"} +#' but applies a correction term for each SNP pair based on the number of jointly +#' non-missing samples. Matches the exact formula from the DENTIST C++ binary's +#' \code{calcLDFromBfile_gcta}. Use this when missingness varies substantially +#' across SNPs and accuracy of individual LD entries matters.} #' } #' #' @param X Numeric genotype matrix (samples x SNPs). May contain \code{NA} #' for missing genotypes. -#' @param method Character, either \code{"sample"} (default, N-1 denominator) or -#' \code{"population"} (N denominator, GCTA-style). Partial matching is supported. -#' @param trim_samples Logical. If \code{TRUE} and \code{method = "population"}, -#' drops trailing samples so that \code{nrow(X)} is a multiple of 4, matching -#' PLINK .bed file chunk processing. Ignored when \code{method = "sample"}. -#' Default is \code{FALSE}. +#' @param method Character, one of \code{"sample"} (default, N-1 denominator), +#' \code{"population"} (N denominator, GCTA-style), or \code{"gcta"} (per-pair +#' missing data correction). Partial matching is supported. +#' @param trim_samples Logical. If \code{TRUE} and \code{method} is +#' \code{"population"} or \code{"gcta"}, drops trailing samples so that +#' \code{nrow(X)} is a multiple of 4, matching PLINK .bed file chunk processing. +#' Ignored when \code{method = "sample"}. Default is \code{FALSE}. #' @param shrinkage Numeric in (0, 1]. Shrink the LD matrix toward the identity: #' \code{R_s = (1 - shrinkage) * R + shrinkage * I}. Useful for regularizing #' LD for summary-statistics-based methods such as lassosum (Mak et al 2017). @@ -243,10 +249,13 @@ safe_svd <- function(mat, tol = 1e-8, max_rank = NULL) { #' #' # GCTA-style population variance #' R2 <- compute_LD(X, method = "population") +#' +#' # GCTA-style with per-pair missing data correction +#' R3 <- compute_LD(X, method = "gcta") #' } #' #' @export -compute_LD <- function(X, method = c("sample", "population"), +compute_LD <- function(X, method = c("sample", "population", "gcta"), trim_samples = FALSE, shrinkage = 0) { if (is.null(X)) { stop("X must be provided.") @@ -269,7 +278,7 @@ compute_LD <- function(X, method = c("sample", "population"), } else { R <- cor(X_imp) } - } else { + } else if (method == "population") { # ---- Population variance (N denominator, GCTA-style) ---- # Optionally trim trailing samples to a multiple of 4 (matches .bed processing) if (trim_samples) { @@ -304,6 +313,58 @@ compute_LD <- function(X, method = c("sample", "population"), # Correlation sd_vec <- sqrt(col_vars) R <- cov_mat / outer(sd_vec, sd_vec) + } else { + # ---- GCTA per-pair missing data correction ---- + # Matches the DENTIST binary's calcLDFromBfile_gcta formula exactly. + # Unlike "population" which divides by total N, this method tracks + # per-pair missing counts and applies a correction term. + if (trim_samples) { + N_kept <- (nrow(X) %/% 4L) * 4L + if (N_kept < nrow(X)) X <- X[seq_len(N_kept), , drop = FALSE] + } + N <- nrow(X) + p <- ncol(X) + + # Marginal statistics from non-missing values + col_means <- colMeans(X, na.rm = TRUE) + col_mean_sq <- colMeans(X^2, na.rm = TRUE) + col_vars <- col_mean_sq - col_means^2 + + # Build indicator matrix for non-missing values + not_na <- !is.na(X) + # Replace NA with 0 for cross-product computation + X_zero <- X + X_zero[is.na(X_zero)] <- 0 + + # Per-pair non-missing counts: not_na'not_na gives count of jointly observed + pair_counts <- crossprod(not_na * 1.0) + n_missing <- N - pair_counts + + # Per-pair sums: sum of X_i over samples where both i and j are observed + # For the correction term we need E_i2 = sum_i_pair / N (pair-specific mean) + # X_zero' %*% not_na gives, for each (i,j), sum of X_i where j is not missing + pair_sums <- crossprod(X_zero, not_na * 1.0) + + # Cross-product sum: sum(X_i * X_j) over jointly non-missing samples + sum_XY <- crossprod(X_zero) + + # GCTA correction formula: + # E_i2[i,j] = pair_sums[i,j] / N (mean of SNP i restricted to non-missing-j samples, divided by N) + # cov = sum_XY/N + E[i]*E[j]*(N-m)/N - E[i]*E_j2 - E_i2*E[j] + E_i2 <- pair_sums / N # p x p: row i, col j = sum of X_i where j non-missing, / N + E_j2 <- t(E_i2) # transposed version + + cov_mat <- sum_XY / N + + outer(col_means, col_means) * (pair_counts / N) - + col_means * E_j2 - + E_i2 * rep(col_means, each = p) + + # Correlation + sd_vec <- sqrt(col_vars) + sd_outer <- outer(sd_vec, sd_vec) + R <- matrix(0.001, p, p) + valid <- sd_outer > 0 + R[valid] <- cov_mat[valid] / sd_outer[valid] } # Ensure clean output @@ -499,13 +560,14 @@ parse_variant_id <- function(ids) { normalized <- gsub("_", ":", ids) normalized <- strip_build_suffix(normalized) - # Split into parts - parts <- strsplit(normalized, ":", fixed = TRUE) - # Truncate any entries with more than 4 parts to just the first 4 - parts <- lapply(parts, function(p) p[1:min(length(p), 4)]) - - data <- data.frame(do.call(rbind, parts), stringsAsFactors = FALSE) - colnames(data) <- c("chrom", "pos", "A2", "A1") + # Split into exactly 4 fields using strcapture (vectorized, no list overhead) + data <- strcapture( + "^([^:]+):([^:]+):([^:]+):([^:]+)", + normalized, + proto = data.frame(chrom = character(), pos = character(), + A2 = character(), A1 = character(), + stringsAsFactors = FALSE) + ) data$chrom <- as.integer(strip_chr_prefix(data$chrom)) data$pos <- as.integer(data$pos) diff --git a/R/raiss.R b/R/raiss.R index af770bae..0b50bb54 100644 --- a/R/raiss.R +++ b/R/raiss.R @@ -210,7 +210,7 @@ raiss_single_matrix_from_X <- function(ref_panel, known_zscores, X, lamb = 0.01, )) } -#' Robust and accurate imputation from summary statistics +#' Impute Summary Statistics Using LD (RAISS) #' #' This function is a part of the statistical library for SNP imputation from: #' https://gitlab.pasteur.fr/statistical-genetics/raiss/-/blob/master/raiss/stat_models.py diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 391ce30f..83a4983a 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -203,7 +203,7 @@ prs_cs_weights <- function(stat, LD, ...) { #' @export sdpr <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000, a0k = 0.5, b0k = 0.5, iter = 1000, burn = 200, thin = 5, n_threads = 1, - opt_llk = 1, verbose = TRUE) { + opt_llk = 1, verbose = TRUE, seed = NULL) { # Check if the sum of the rows in LD list is the same as length of bhat if (sum(sapply(LD, nrow)) != length(bhat)) { stop("The sum of the rows in LD list must be the same as the length of bhat.") @@ -232,7 +232,7 @@ sdpr <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = # Call the sdpr_rcpp function result <- sdpr_rcpp( bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, - n_threads, opt_llk, verbose + n_threads, opt_llk, verbose, seed ) return(result) diff --git a/R/slalom.R b/R/slalom.R index d9000766..24ce5395 100644 --- a/R/slalom.R +++ b/R/slalom.R @@ -21,6 +21,10 @@ #' for prediction. Default is 0.6. #' @param lead_variant_choice Character, method to choose the lead variant, either #' "pvalue" or "abf", with default "pvalue". +#' @param ld_method Character string specifying the LD computation method when +#' \code{X} is provided. Passed to \code{\link{compute_LD}}. One of +#' \code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +#' Ignored when \code{R} is provided directly. #' @return A list containing the annotated LD matrix with ABF results, credible sets, #' lead variant, and DENTIST-S statistics; and a summary dataframe with aggregate statistics. #' @examples @@ -30,9 +34,11 @@ #' slalom <- function(zScore, R = NULL, X = NULL, standard_error = rep(1, length(zScore)), abf_prior_variance = 0.04, nlog10p_dentist_s_threshold = 4.0, - r2_threshold = 0.6, lead_variant_choice = "pvalue") { + r2_threshold = 0.6, lead_variant_choice = "pvalue", + ld_method = "sample") { # Resolve LD matrix from R or X - ld_resolved <- resolve_LD_input(R = R, X = X, need_nSample = FALSE) + ld_resolved <- resolve_LD_input(R = R, X = X, need_nSample = FALSE, + ld_method = ld_method) LD_mat <- ld_resolved$R if (!is.matrix(LD_mat) || nrow(LD_mat) != ncol(LD_mat) || nrow(LD_mat) != length(zScore)) { diff --git a/R/sumstats_qc.R b/R/sumstats_qc.R index 2200634e..7c45b27e 100644 --- a/R/sumstats_qc.R +++ b/R/sumstats_qc.R @@ -82,56 +82,89 @@ rss_basic_qc <- function(sumstats, LD_data, skip_region = NULL, keep_indel = TRU } +#' Detect LD-Summary Statistic Mismatches +#' +#' Unified wrapper for detecting outlier variants due to LD-summary statistic +#' mismatches. Dispatches to either \code{\link{dentist_single_window}} or +#' \code{\link{slalom}} based on the \code{method} argument. +#' +#' @param zScore Numeric vector of z-scores. +#' @param R Square LD correlation matrix. Provide either \code{R} or \code{X}. +#' @param X Genotype matrix (samples x SNPs). If provided, LD is computed via +#' \code{\link{compute_LD}} and \code{nSample} defaults to \code{nrow(X)}. +#' @param nSample Number of samples in the LD reference panel. Required when +#' \code{R} is provided and \code{method = "dentist"}; inferred from \code{X} +#' when \code{X} is provided. +#' @param method Character string specifying the QC method: \code{"slalom"} +#' (default) or \code{"dentist"}. +#' @param ld_method Character string specifying the LD computation method when +#' \code{X} is provided. One of \code{"sample"} (default), \code{"population"}, +#' or \code{"gcta"}. Ignored when \code{R} is provided directly. +#' @param ... Additional arguments passed to the underlying QC method +#' (\code{\link{dentist_single_window}} or \code{\link{slalom}}). +#' +#' @return A data frame with at least a logical \code{outlier} column indicating +#' which variants are identified as outliers. The remaining columns depend on +#' the method used. +#' +#' @seealso \code{\link{dentist_single_window}}, \code{\link{slalom}}, +#' \code{\link{summary_stats_qc}} +#' @importFrom dplyr mutate row_number filter pull +#' @export +ld_mismatch_qc <- function(zScore, R = NULL, X = NULL, nSample = NULL, + method = c("slalom", "dentist"), + ld_method = "sample", ...) { + method <- match.arg(method) + if (method == "dentist") { + qc_results <- dentist_single_window(zScore, R = R, X = X, nSample = nSample, + ld_method = ld_method, ...) + return(qc_results) + } else { + qc_results <- slalom(zScore, R = R, X = X, ld_method = ld_method, ...) + # Standardize output: slalom uses "outliers", rename to "outlier" for consistency + result <- qc_results$data + if ("outliers" %in% colnames(result) && !"outlier" %in% colnames(result)) { + colnames(result)[colnames(result) == "outliers"] <- "outlier" + } + return(result) + } +} + #' Perform Quality Control on Summary Statistics #' #' This function performs quality control on the processed summary statistics using the specified method. +#' It wraps \code{\link{ld_mismatch_qc}} and handles subsetting of the summary statistics and LD matrix. #' #' @param sumstats A data frame containing the processed summary statistics. #' @param LD_data A list containing the combined LD variants data generated by load_LD_matrix. +#' @param n Sample size for the LD reference panel (used by dentist method). #' @param method The quality control method to use. Options are "slalom" or "dentist" (default: "slalom"). #' #' @return A list containing the quality-controlled summary statistics and updated LD matrix. -#' - sumstats_qc: The quality-controlled summary statistics data frame. -#' - LD_mat_qc: The updated LD matrix after quality control. -#' -#' @details This function applies the specified quality control method to the processed summary statistics. -#' -#' The available quality control methods are: -#' - "dentist": Applies the DENTIST quality control procedure (Chen et al 2021). -#' - "slalom": Applies the SLALOM quality control procedure. +#' - sumstats: The quality-controlled summary statistics data frame. +#' - LD_mat: The updated LD matrix after quality control. +#' - outlier_number: The number of outlier variants removed. #' -#' The function returns the quality-controlled summary statistics along with the updated LD matrix. +#' @details This function applies the specified quality control method to the processed summary statistics +#' via \code{\link{ld_mismatch_qc}}, then subsets the summary statistics and LD matrix to keep +#' only non-outlier variants. #' #' @examples #' # Perform SLALOM quality control (default) #' qc_results <- summary_stats_qc(sumstats, LD_data, method = "slalom") #' +#' @importFrom dplyr mutate row_number filter pull #' @export summary_stats_qc <- function(sumstats, LD_data, n = NULL, method = c("slalom", "dentist")) { method <- match.arg(method) - # assuming sumstats has been allele QC-ed, using rss_basic_qc() function LD_extract <- LD_data$LD_matrix[sumstats$variant_id, sumstats$variant_id, drop = FALSE] - if (method == "dentist") { - qc_results <- dentist_single_window(sumstats$z, R = LD_extract, nSample = n, duprThreshold = 0.99) - keep_index <- qc_results %>% - mutate(index = row_number()) %>% - filter(!outlier) %>% - pull(index) - sumstats_qc <- sumstats[keep_index, , drop = FALSE] - LD_mat_qc <- LD_extract[sumstats_qc$variant_id, sumstats_qc$variant_id, drop = FALSE] - outlier_number <- nrow(sumstats) - nrow(sumstats_qc) - } else if (method == "slalom") { - qc_results <- slalom(zScore = sumstats$z, R = LD_extract) - keep_index <- qc_results$data %>% - mutate(index = row_number()) %>% - filter(!outliers) %>% - pull(index) - sumstats_qc <- sumstats[keep_index, , drop = FALSE] - LD_mat_qc <- LD_extract[sumstats_qc$variant_id, sumstats_qc$variant_id, drop = FALSE] - outlier_number <- nrow(sumstats) - nrow(sumstats_qc) - } else { - stop("Invalid quality control method specified. Available methods are: 'dentist', 'slalom'.") - } + + qc_results <- ld_mismatch_qc(zScore = sumstats$z, R = LD_extract, nSample = n, + method = method) + keep_index <- which(!qc_results$outlier) + sumstats_qc <- sumstats[keep_index, , drop = FALSE] + LD_mat_qc <- LD_extract[sumstats_qc$variant_id, sumstats_qc$variant_id, drop = FALSE] + outlier_number <- nrow(sumstats) - nrow(sumstats_qc) return(list(sumstats = sumstats_qc, LD_mat = LD_mat_qc, outlier_number = outlier_number)) } diff --git a/R/univariate_pipeline.R b/R/univariate_pipeline.R index 63029909..5ccb052e 100644 --- a/R/univariate_pipeline.R +++ b/R/univariate_pipeline.R @@ -264,7 +264,7 @@ rss_analysis_pipeline <- function( # Quality control (always uses R) if (!is.null(qc_method)) { - qc_results <- summary_stats_qc(sumstats, LD_data, n = n, var_y = var_y, method = qc_method) + qc_results <- summary_stats_qc(sumstats, LD_data, n = n, method = qc_method) sumstats <- qc_results$sumstats LD_mat <- qc_results$LD_mat } diff --git a/man/compute_LD.Rd b/man/compute_LD.Rd index 3a2c9578..d7b1b5e5 100644 --- a/man/compute_LD.Rd +++ b/man/compute_LD.Rd @@ -6,7 +6,7 @@ \usage{ compute_LD( X, - method = c("sample", "population"), + method = c("sample", "population", "gcta"), trim_samples = FALSE, shrinkage = 0 ) @@ -15,13 +15,14 @@ compute_LD( \item{X}{Numeric genotype matrix (samples x SNPs). May contain \code{NA} for missing genotypes.} -\item{method}{Character, either \code{"sample"} (default, N-1 denominator) or -\code{"population"} (N denominator, GCTA-style). Partial matching is supported.} +\item{method}{Character, one of \code{"sample"} (default, N-1 denominator), +\code{"population"} (N denominator, GCTA-style), or \code{"gcta"} (per-pair +missing data correction). Partial matching is supported.} -\item{trim_samples}{Logical. If \code{TRUE} and \code{method = "population"}, -drops trailing samples so that \code{nrow(X)} is a multiple of 4, matching -PLINK .bed file chunk processing. Ignored when \code{method = "sample"}. -Default is \code{FALSE}.} +\item{trim_samples}{Logical. If \code{TRUE} and \code{method} is +\code{"population"} or \code{"gcta"}, drops trailing samples so that +\code{nrow(X)} is a multiple of 4, matching PLINK .bed file chunk processing. +Ignored when \code{method = "sample"}. Default is \code{FALSE}.} \item{shrinkage}{Numeric in (0, 1]. Shrink the LD matrix toward the identity: \code{R_s = (1 - shrinkage) * R + shrinkage * I}. Useful for regularizing @@ -34,7 +35,7 @@ A symmetric correlation matrix with row and column names taken from } \description{ Computes a pairwise Pearson correlation matrix from a genotype matrix. -Supports two variance conventions: +Supports three variance conventions: \describe{ \item{\code{"sample"}}{Standard sample variance with N-1 denominator (default). Uses mean imputation for missing genotypes, then \code{Rfast::cora} (if available) @@ -44,6 +45,11 @@ Supports two variance conventions: from non-missing values; missing entries are set to zero after centering so they do not contribute to cross-products. Cross-products are normalized by the total sample count N, not by pairwise non-missing counts.} + \item{\code{"gcta"}}{GCTA per-pair missing data correction. Like \code{"population"} + but applies a correction term for each SNP pair based on the number of jointly + non-missing samples. Matches the exact formula from the DENTIST C++ binary's + \code{calcLDFromBfile_gcta}. Use this when missingness varies substantially + across SNPs and accuracy of individual LD entries matters.} } } \details{ @@ -73,6 +79,9 @@ R1 <- compute_LD(X) # GCTA-style population variance R2 <- compute_LD(X, method = "population") + +# GCTA-style with per-pair missing data correction +R3 <- compute_LD(X, method = "gcta") } } diff --git a/man/compute_LD_gcta_cpp.Rd b/man/compute_LD_gcta_cpp.Rd deleted file mode 100644 index 4193c0dd..00000000 --- a/man/compute_LD_gcta_cpp.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R -\name{compute_LD_gcta_cpp} -\alias{compute_LD_gcta_cpp} -\title{Compute LD matrix using GCTA-style formula matching the DENTIST binary.} -\usage{ -compute_LD_gcta_cpp(X, ncpus = 1L) -} -\arguments{ -\item{X}{Numeric genotype matrix (samples x SNPs), values in {0, 1, 2, NA}. -Rows should already be trimmed to a multiple of 4.} - -\item{ncpus}{Number of CPU threads for parallel computation.} -} -\value{ -Symmetric LD correlation matrix with 1.0 on diagonal. -} -\description{ -This function computes pairwise Pearson correlations from a genotype matrix -using the exact same formula and floating-point operation order as the -original DENTIST C++ binary (calcLDFromBfile_gcta in bfileOperations.cpp). -Key differences from R's crossprod-based approach: - 1. Accumulates integer-valued sums sequentially (exact intermediate values) - 2. Handles per-pair missing data with GCTA correction formula - 3. Divides by nKeptSample (trimmed to multiple of 4), not nrow(X) -} diff --git a/man/dentist.Rd b/man/dentist.Rd index 3b99609d..b16735d2 100644 --- a/man/dentist.Rd +++ b/man/dentist.Rd @@ -19,7 +19,8 @@ dentist( duprThreshold = 0.99, ncpus = 1, correct_chen_et_al_bug = TRUE, - min_dim = 2000 + min_dim = 2000, + ld_method = "sample" ) } \arguments{ @@ -62,6 +63,11 @@ in distance mode (base pairs). Default is 2000000 (2 Mb). Only used when \item{min_dim}{In distance mode: minimum number of SNPs per block (default 2000). In count mode: the number of variants per window (i.e., the window size).} + +\item{ld_method}{Character string specifying the LD computation method when +\code{X} is provided. Passed to \code{\link{compute_LD}}. One of +\code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +Ignored when \code{R} is provided directly.} } \value{ A data frame containing the imputed result and detected outliers. diff --git a/man/dentist_from_files.Rd b/man/dentist_from_files.Rd deleted file mode 100644 index 7095d6bc..00000000 --- a/man/dentist_from_files.Rd +++ /dev/null @@ -1,106 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dentist_qc.R -\name{dentist_from_files} -\alias{dentist_from_files} -\title{Run R DENTIST Implementation on DENTIST-format Input Files} -\usage{ -dentist_from_files( - gwas_summary, - bfile, - nSample = NULL, - window_size = 2e+06, - window_mode = c("distance", "count"), - pValueThreshold = 5.0369e-08, - propSVD = 0.4, - gcControl = FALSE, - nIter = 10, - gPvalueThreshold = 0.05, - duprThreshold = 0.99, - ncpus = 1, - correct_chen_et_al_bug = TRUE, - min_dim = 2000, - use_gcta_LD = FALSE, - verbose = TRUE -) -} -\arguments{ -\item{gwas_summary}{Path to the GWAS summary statistics file (DENTIST format: -tab-separated, 8 columns with header: SNP A1 A2 freq beta se p N). May be gzipped.} - -\item{bfile}{PLINK binary file prefix (expects .bed/.bim/.fam files).} - -\item{nSample}{Number of samples in the LD reference panel. If NULL (recommended), -uses the reference panel size from the genotype matrix. This controls the SVD -truncation rank K = min(idx_size, nSample) * propSVD. Note: this is the reference -panel size, NOT the GWAS sample size. Default is NULL.} - -\item{window_size}{Window size in base pairs for distance mode. Default is 2000000.} - -\item{window_mode}{Character string specifying the windowing strategy: -\code{"distance"} (default) or \code{"count"}. See \code{\link{dentist}}.} - -\item{pValueThreshold}{P-value threshold for outlier detection. Default is 5.0369e-8.} - -\item{propSVD}{SVD truncation proportion. Default is 0.4.} - -\item{gcControl}{Logical; apply genomic control. Default is FALSE.} - -\item{nIter}{Number of QC iterations. Default is 10.} - -\item{gPvalueThreshold}{GWAS p-value threshold for grouping. Default is 0.05.} - -\item{duprThreshold}{LD r-squared threshold for duplicate detection. Default is 0.99.} - -\item{ncpus}{Number of CPU threads. Default is 1.} - -\item{correct_chen_et_al_bug}{Logical; correct known bugs in original DENTIST. Default is TRUE.} - -\item{min_dim}{In distance mode: minimum number of SNPs per block (default 2000). -In count mode: the number of variants per window.} - -\item{use_gcta_LD}{Logical; use GCTA-style LD computation and raw B-allele -counts to match the DENTIST binary's exact floating-point behavior. Requires -snpStats. Default is FALSE; set to TRUE when comparing against the binary.} - -\item{verbose}{Logical; print progress messages. Default is TRUE.} -} -\value{ -A list with components: - \describe{ - \item{result}{Data frame from \code{\link{dentist}} with outlier detection results.} - \item{sum_stat}{Aligned summary statistics data frame (with SNP names and positions).} - \item{LD_mat}{The LD correlation matrix used.} - } -} -\description{ -Takes the same file-based inputs as the DENTIST C++ binary (summary stats file + -PLINK bfile) and runs the R \code{\link{dentist}} implementation. This allows -direct comparison between the R implementation and the C++ binary on identical data. -} -\details{ -This function reuses existing package utilities for file I/O, allele QC, and LD -computation: \code{\link{read_bim}} for PLINK bim files, \code{\link{allele_qc}} -for allele matching/flipping, \code{\link{load_genotype_region}} for genotype loading, -and \code{compute_LD} (from misc.R) for LD matrix computation with mean imputation -and Rfast::cora when available. - - -This function performs the full pipeline: -\enumerate{ - \item Reads the summary statistics file via \code{\link{read_dentist_sumstat}}. - \item Reads the PLINK bim file via \code{read_bim} and matches SNPs by ID - to obtain chromosome and position information. - \item Aligns alleles using \code{\link{allele_qc}}, which handles strand flips, - allele swaps, and z-score sign flipping. - \item Loads genotype data via \code{\link{load_genotype_region}} and computes - the LD correlation matrix via \code{compute_LD} (mean imputation + Rfast::cora). - \item Calls \code{\link{dentist}} with the aligned data. -} - -The result includes the aligned summary statistics and LD matrix so they can be -reused for further analysis or debugging. -} -\seealso{ -\code{\link{dentist}}, \code{\link{read_dentist_sumstat}}, - \code{\link{allele_qc}}, \code{\link{parse_dentist_output}} -} diff --git a/man/dentist_single_window.Rd b/man/dentist_single_window.Rd index 184f0c02..417da12c 100644 --- a/man/dentist_single_window.Rd +++ b/man/dentist_single_window.Rd @@ -16,7 +16,8 @@ dentist_single_window( gPvalueThreshold = 0.05, duprThreshold = 0.99, ncpus = 1, - correct_chen_et_al_bug = TRUE + correct_chen_et_al_bug = TRUE, + ld_method = "sample" ) } \arguments{ @@ -46,6 +47,11 @@ inferred from \code{X} when \code{X} is provided.} \item{ncpus}{Number of CPU cores. Default is 1.} \item{correct_chen_et_al_bug}{Correct the original DENTIST operator! bug. Default is TRUE.} + +\item{ld_method}{Character string specifying the LD computation method when +\code{X} is provided. Passed to \code{\link{compute_LD}}. One of +\code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +Ignored when \code{R} is provided directly.} } \value{ Data frame with columns: original_z, imputed_z, iter_to_correct, rsq, diff --git a/man/get_ctwas_meta_data.Rd b/man/get_ctwas_meta_data.Rd deleted file mode 100644 index 1d83ce00..00000000 --- a/man/get_ctwas_meta_data.Rd +++ /dev/null @@ -1,11 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ctwas_wrapper.R -\name{get_ctwas_meta_data} -\alias{get_ctwas_meta_data} -\title{Utility function to format meta data dataframe for cTWAS analyses} -\usage{ -get_ctwas_meta_data(ld_meta_data_file, subset_region_ids = NULL) -} -\description{ -Utility function to format meta data dataframe for cTWAS analyses -} diff --git a/man/ld_loader.Rd b/man/ld_loader.Rd index 03955fb5..61b02b56 100644 --- a/man/ld_loader.Rd +++ b/man/ld_loader.Rd @@ -30,7 +30,7 @@ is used.} \code{.cor.xz} LD matrix files) and optionally \code{SNP_file} (paths to companion \code{.bim} files; defaults to \code{paste0(LD_file, ".bim")} if absent). As returned by -\code{\link{get_ctwas_meta_data}}.} +cTWAS meta-data utilities.} \item{return_genotype}{Logical. When using region mode, return the genotype matrix X (\code{TRUE}) or LD correlation R (\code{FALSE}, @@ -61,7 +61,7 @@ Four modes are supported: Set \code{ld_meta_path} and \code{regions}.} \item{LD_info mode}{Loads pre-computed LD blocks from \code{.cor.xz} files listed in an \code{LD_info} data.frame (as returned by - \code{\link{get_ctwas_meta_data}}). Set \code{LD_info}.} + cTWAS meta-data utilities). Set \code{LD_info}.} } } \examples{ diff --git a/man/ld_mismatch_qc.Rd b/man/ld_mismatch_qc.Rd new file mode 100644 index 00000000..441fec0b --- /dev/null +++ b/man/ld_mismatch_qc.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sumstats_qc.R +\name{ld_mismatch_qc} +\alias{ld_mismatch_qc} +\title{Detect LD-Summary Statistic Mismatches} +\usage{ +ld_mismatch_qc( + zScore, + R = NULL, + X = NULL, + nSample = NULL, + method = c("slalom", "dentist"), + ld_method = "sample", + ... +) +} +\arguments{ +\item{zScore}{Numeric vector of z-scores.} + +\item{R}{Square LD correlation matrix. Provide either \code{R} or \code{X}.} + +\item{X}{Genotype matrix (samples x SNPs). If provided, LD is computed via +\code{\link{compute_LD}} and \code{nSample} defaults to \code{nrow(X)}.} + +\item{nSample}{Number of samples in the LD reference panel. Required when +\code{R} is provided and \code{method = "dentist"}; inferred from \code{X} +when \code{X} is provided.} + +\item{method}{Character string specifying the QC method: \code{"slalom"} +(default) or \code{"dentist"}.} + +\item{ld_method}{Character string specifying the LD computation method when +\code{X} is provided. One of \code{"sample"} (default), \code{"population"}, +or \code{"gcta"}. Ignored when \code{R} is provided directly.} + +\item{...}{Additional arguments passed to the underlying QC method +(\code{\link{dentist_single_window}} or \code{\link{slalom}}).} +} +\value{ +A data frame with at least a logical \code{outlier} column indicating + which variants are identified as outliers. The remaining columns depend on + the method used. +} +\description{ +Unified wrapper for detecting outlier variants due to LD-summary statistic +mismatches. Dispatches to either \code{\link{dentist_single_window}} or +\code{\link{slalom}} based on the \code{method} argument. +} +\seealso{ +\code{\link{dentist_single_window}}, \code{\link{slalom}}, + \code{\link{summary_stats_qc}} +} diff --git a/man/parse_dentist_output.Rd b/man/parse_dentist_output.Rd deleted file mode 100644 index a8520fe2..00000000 --- a/man/parse_dentist_output.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dentist_qc.R -\name{parse_dentist_output} -\alias{parse_dentist_output} -\title{Parse DENTIST Binary Output Files} -\usage{ -parse_dentist_output(output_prefix, pValueThreshold = 5e-08) -} -\arguments{ -\item{output_prefix}{The output prefix used when running the DENTIST software -(the \code{--out} argument).} - -\item{pValueThreshold}{P-value threshold used to determine outlier status. -Default is 5e-8.} -} -\value{ -A data frame with columns: SNP, test_stat, neg_log10_p, is_duplicate, outlier. -} -\description{ -Reads the output files produced by the DENTIST C++ software and returns -a structured data frame. Useful for comparing DENTIST output against -\code{\link{dentist_from_files}} results. -} -\details{ -The DENTIST software (non-debug mode) writes a \code{.DENTIST.full.txt} file with -4 tab-separated columns (no header): rsID, stat/lambda, -log10(pvalue), isDuplicate. -It also writes a \code{.DENTIST.short.txt} file listing outlier SNP IDs (one per line). -} diff --git a/man/raiss.Rd b/man/raiss.Rd index 95b5d521..d8495efd 100644 --- a/man/raiss.Rd +++ b/man/raiss.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/raiss.R \name{raiss} \alias{raiss} -\title{Robust and accurate imputation from summary statistics} +\title{Impute Summary Statistics Using LD (RAISS)} \usage{ raiss( ref_panel, diff --git a/man/read_dentist_sumstat.Rd b/man/read_dentist_sumstat.Rd deleted file mode 100644 index 81443d3e..00000000 --- a/man/read_dentist_sumstat.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dentist_qc.R -\name{read_dentist_sumstat} -\alias{read_dentist_sumstat} -\title{Read DENTIST-format Summary Statistics} -\usage{ -read_dentist_sumstat(gwas_summary) -} -\arguments{ -\item{gwas_summary}{Path to summary statistics file (tab/space-separated, -8 columns: SNP A1 A2 freq beta se p N). May be gzipped.} -} -\value{ -A data frame with columns: SNP, A1, A2, freq, beta, se, p, N, z. -} -\description{ -Reads a GWAS summary statistics file in the format expected by the DENTIST C++ binary -and returns it as an R data frame with z-scores computed. -} diff --git a/man/sdpr.Rd b/man/sdpr.Rd index d722d8ca..28577cd6 100644 --- a/man/sdpr.Rd +++ b/man/sdpr.Rd @@ -20,7 +20,8 @@ sdpr( thin = 5, n_threads = 1, opt_llk = 1, - verbose = TRUE + verbose = TRUE, + seed = NULL ) } \arguments{ diff --git a/man/slalom.Rd b/man/slalom.Rd index 14b324cc..b804aa02 100644 --- a/man/slalom.Rd +++ b/man/slalom.Rd @@ -12,7 +12,8 @@ slalom( abf_prior_variance = 0.04, nlog10p_dentist_s_threshold = 4, r2_threshold = 0.6, - lead_variant_choice = "pvalue" + lead_variant_choice = "pvalue", + ld_method = "sample" ) } \arguments{ @@ -37,6 +38,11 @@ for prediction. Default is 0.6.} \item{lead_variant_choice}{Character, method to choose the lead variant, either "pvalue" or "abf", with default "pvalue".} + +\item{ld_method}{Character string specifying the LD computation method when +\code{X} is provided. Passed to \code{\link{compute_LD}}. One of +\code{"sample"} (default), \code{"population"}, or \code{"gcta"}. +Ignored when \code{R} is provided directly.} } \value{ A list containing the annotated LD matrix with ABF results, credible sets, diff --git a/man/summary_stats_qc.Rd b/man/summary_stats_qc.Rd index 1a023459..1aa065da 100644 --- a/man/summary_stats_qc.Rd +++ b/man/summary_stats_qc.Rd @@ -11,24 +11,24 @@ summary_stats_qc(sumstats, LD_data, n = NULL, method = c("slalom", "dentist")) \item{LD_data}{A list containing the combined LD variants data generated by load_LD_matrix.} +\item{n}{Sample size for the LD reference panel (used by dentist method).} + \item{method}{The quality control method to use. Options are "slalom" or "dentist" (default: "slalom").} } \value{ A list containing the quality-controlled summary statistics and updated LD matrix. - - sumstats_qc: The quality-controlled summary statistics data frame. - - LD_mat_qc: The updated LD matrix after quality control. + - sumstats: The quality-controlled summary statistics data frame. + - LD_mat: The updated LD matrix after quality control. + - outlier_number: The number of outlier variants removed. } \description{ This function performs quality control on the processed summary statistics using the specified method. +It wraps \code{\link{ld_mismatch_qc}} and handles subsetting of the summary statistics and LD matrix. } \details{ -This function applies the specified quality control method to the processed summary statistics. - - The available quality control methods are: - - "dentist": Applies the DENTIST quality control procedure (Chen et al 2021). - - "slalom": Applies the SLALOM quality control procedure. - - The function returns the quality-controlled summary statistics along with the updated LD matrix. +This function applies the specified quality control method to the processed summary statistics + via \code{\link{ld_mismatch_qc}}, then subsets the summary statistics and LD matrix to keep + only non-outlier variants. } \examples{ # Perform SLALOM quality control (default) diff --git a/pixi.toml b/pixi.toml index 2b67a576..b78eff33 100644 --- a/pixi.toml +++ b/pixi.toml @@ -41,6 +41,7 @@ r45 = {features = ["r45"]} "r-rcmdcheck" = "*" "r-testthat" = "*" "r-tidyverse" = "*" +"bioconductor-biostrings" = "*" "bioconductor-iranges" = "*" "bioconductor-qvalue" = "*" "bioconductor-s4vectors" = "*" diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index ce21e098..ee17c49a 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,18 +11,6 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif -// compute_LD_gcta_cpp -arma::mat compute_LD_gcta_cpp(const Rcpp::NumericMatrix& X, int ncpus); -RcppExport SEXP _pecotmr_compute_LD_gcta_cpp(SEXP XSEXP, SEXP ncpusSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const Rcpp::NumericMatrix& >::type X(XSEXP); - Rcpp::traits::input_parameter< int >::type ncpus(ncpusSEXP); - rcpp_result_gen = Rcpp::wrap(compute_LD_gcta_cpp(X, ncpus)); - return rcpp_result_gen; -END_RCPP -} // dentist_iterative_impute List dentist_iterative_impute(const arma::mat& LD_mat, size_t nSample, const arma::vec& zScore, double pValueThreshold, float propSVD, bool gcControl, int nIter, double gPvalueThreshold, int ncpus, bool correct_chen_et_al_bug, bool verbose); RcppExport SEXP _pecotmr_dentist_iterative_impute(SEXP LD_matSEXP, SEXP nSampleSEXP, SEXP zScoreSEXP, SEXP pValueThresholdSEXP, SEXP propSVDSEXP, SEXP gcControlSEXP, SEXP nIterSEXP, SEXP gPvalueThresholdSEXP, SEXP ncpusSEXP, SEXP correct_chen_et_al_bugSEXP, SEXP verboseSEXP) { @@ -99,8 +87,8 @@ BEGIN_RCPP END_RCPP } // sdpr_rcpp -Rcpp::List sdpr_rcpp(const std::vector& bhat, const Rcpp::List& LD, int n, Rcpp::Nullable per_variant_sample_size, Rcpp::Nullable array, double a, double c, size_t M, double a0k, double b0k, int iter, int burn, int thin, unsigned n_threads, int opt_llk, bool verbose); -RcppExport SEXP _pecotmr_sdpr_rcpp(SEXP bhatSEXP, SEXP LDSEXP, SEXP nSEXP, SEXP per_variant_sample_sizeSEXP, SEXP arraySEXP, SEXP aSEXP, SEXP cSEXP, SEXP MSEXP, SEXP a0kSEXP, SEXP b0kSEXP, SEXP iterSEXP, SEXP burnSEXP, SEXP thinSEXP, SEXP n_threadsSEXP, SEXP opt_llkSEXP, SEXP verboseSEXP) { +Rcpp::List sdpr_rcpp(const std::vector& bhat, const Rcpp::List& LD, int n, Rcpp::Nullable per_variant_sample_size, Rcpp::Nullable array, double a, double c, size_t M, double a0k, double b0k, int iter, int burn, int thin, unsigned n_threads, int opt_llk, bool verbose, Rcpp::Nullable seed); +RcppExport SEXP _pecotmr_sdpr_rcpp(SEXP bhatSEXP, SEXP LDSEXP, SEXP nSEXP, SEXP per_variant_sample_sizeSEXP, SEXP arraySEXP, SEXP aSEXP, SEXP cSEXP, SEXP MSEXP, SEXP a0kSEXP, SEXP b0kSEXP, SEXP iterSEXP, SEXP burnSEXP, SEXP thinSEXP, SEXP n_threadsSEXP, SEXP opt_llkSEXP, SEXP verboseSEXP, SEXP seedSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -120,18 +108,18 @@ BEGIN_RCPP Rcpp::traits::input_parameter< unsigned >::type n_threads(n_threadsSEXP); Rcpp::traits::input_parameter< int >::type opt_llk(opt_llkSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); - rcpp_result_gen = Rcpp::wrap(sdpr_rcpp(bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose)); + Rcpp::traits::input_parameter< Rcpp::Nullable >::type seed(seedSEXP); + rcpp_result_gen = Rcpp::wrap(sdpr_rcpp(bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_pecotmr_compute_LD_gcta_cpp", (DL_FUNC) &_pecotmr_compute_LD_gcta_cpp, 2}, {"_pecotmr_dentist_iterative_impute", (DL_FUNC) &_pecotmr_dentist_iterative_impute, 11}, {"_pecotmr_lassosum_rss_rcpp", (DL_FUNC) &_pecotmr_lassosum_rss_rcpp, 5}, {"_pecotmr_prs_cs_rcpp", (DL_FUNC) &_pecotmr_prs_cs_rcpp, 12}, {"_pecotmr_qtl_enrichment_rcpp", (DL_FUNC) &_pecotmr_qtl_enrichment_rcpp, 7}, - {"_pecotmr_sdpr_rcpp", (DL_FUNC) &_pecotmr_sdpr_rcpp, 16}, + {"_pecotmr_sdpr_rcpp", (DL_FUNC) &_pecotmr_sdpr_rcpp, 17}, {NULL, NULL, 0} }; diff --git a/src/dentist_iterative_impute.cpp b/src/dentist_iterative_impute.cpp index 1144eb64..a3478594 100644 --- a/src/dentist_iterative_impute.cpp +++ b/src/dentist_iterative_impute.cpp @@ -17,109 +17,6 @@ using namespace Rcpp; using namespace arma; -//' Compute LD matrix using GCTA-style formula matching the DENTIST binary. -//' -//' This function computes pairwise Pearson correlations from a genotype matrix -//' using the exact same formula and floating-point operation order as the -//' original DENTIST C++ binary (calcLDFromBfile_gcta in bfileOperations.cpp). -//' Key differences from R's crossprod-based approach: -//' 1. Accumulates integer-valued sums sequentially (exact intermediate values) -//' 2. Handles per-pair missing data with GCTA correction formula -//' 3. Divides by nKeptSample (trimmed to multiple of 4), not nrow(X) -//' -//' @param X Numeric genotype matrix (samples x SNPs), values in {0, 1, 2, NA}. -//' Rows should already be trimmed to a multiple of 4. -//' @param ncpus Number of CPU threads for parallel computation. -//' @return Symmetric LD correlation matrix with 1.0 on diagonal. -// [[Rcpp::export]] -arma::mat compute_LD_gcta_cpp(const Rcpp::NumericMatrix& X, int ncpus = 1) { - int n = X.nrow(); - int p = X.ncol(); - - int nProcessors = omp_get_max_threads(); - if (ncpus < nProcessors) nProcessors = ncpus; - omp_set_num_threads(nProcessors); - - int nKeptSample = n; // Already trimmed by caller - - // Step 1: Marginal statistics (matching binary lines 145-168) - // E[i] = sum / (nKeptSample - nMissing_marginal) - // E_sq[i] = (sum + 2*sum11) / (nKeptSample - nMissing_marginal) - // VAR[i] = E_sq[i] - E[i]^2 - std::vector E(p), E_sq(p), VAR(p); - - for (int i = 0; i < p; i++) { - double sum_i = 0.0; - double sum_sq_i = 0.0; - int nMissing = 0; - for (int k = 0; k < n; k++) { - double val = X(k, i); - if (ISNAN(val)) { - nMissing++; - } else { - sum_i += val; - sum_sq_i += val * val; - } - } - int denom = nKeptSample - nMissing; - E[i] = sum_i / denom; - E_sq[i] = sum_sq_i / denom; - VAR[i] = E_sq[i] - E[i] * E[i]; - } - - // Step 2: Pairwise LD (matching binary lines 171-251) - arma::mat LD(p, p); - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < p; i++) { - LD(i, i) = 1.0; - for (int j = i + 1; j < p; j++) { - int nMissing = 0; - // Use long double for sum_XY to match binary's calcLDFromBfile_gcta - // (bfileOperations.cpp line 195). This ensures the covariance formula - // is evaluated in extended precision, matching binary exactly. - long double sum_XY = 0; - double sum_i_pair = 0.0; - double sum_j_pair = 0.0; - - for (int k = 0; k < n; k++) { - double vi = X(k, i); - double vj = X(k, j); - if (ISNAN(vi) || ISNAN(vj)) { - nMissing++; - } else { - sum_XY += vi * vj; - sum_i_pair += vi; - sum_j_pair += vj; - } - } - - // GCTA formula (binary bfileOperations.cpp lines 216-232): - // E_i2 = sum_i_pair / nKeptSample - // cov = sum_XY/N + E_i*E_j*(N-m)/N - E_i*E_j2 - E_i2*E_j - // The binary computes this with long double sum_XY, so the division - // sum_XY/nKeptSample is in long double, which promotes the entire - // expression to long double before storing as double cov_XY. - double E_i2 = sum_i_pair / nKeptSample; - double E_j2 = sum_j_pair / nKeptSample; - double cov_XY = sum_XY / nKeptSample - + E[i] * E[j] * (nKeptSample - nMissing) / (double)nKeptSample - - E[i] * E_j2 - - E_i2 * E[j]; - - double ld_val = 0.001; - if (!(VAR[i] <= 0 || VAR[j] <= 0 || VAR[i] * VAR[j] <= 0)) { - ld_val = cov_XY / std::sqrt(VAR[i] * VAR[j]); - } - - LD(i, j) = ld_val; - LD(j, i) = ld_val; - } - } - - return LD; -} - // DENTIST RNG: uses srand/rand + sort-by-random-values to produce a permutation. // This faithfully reproduces the original DENTIST C++ binary's random partitioning. // diff --git a/src/sdpr.cpp b/src/sdpr.cpp index 3deb5e3a..3670d5e9 100644 --- a/src/sdpr.cpp +++ b/src/sdpr.cpp @@ -1,4 +1,5 @@ #include +#include #include #include "sdpr_mcmc.h" @@ -20,7 +21,8 @@ Rcpp::List sdpr_rcpp( int thin = 5, unsigned n_threads = 1, int opt_llk = 1, - bool verbose = true + bool verbose = true, + Rcpp::Nullable seed = R_NilValue ) { // Convert Rcpp::List to std::vector std::vector ref_ld_mat; @@ -42,12 +44,20 @@ Rcpp::List sdpr_rcpp( arr = std::vector(bhat.size(), 1); } + // Resolve seed + unsigned int seed_val = 0; + if (seed.isNotNull()) { + seed_val = Rcpp::as(seed); + } else { + seed_val = std::random_device{}(); + } + // Create mcmc_data object mcmc_data data(bhat, ref_ld_mat, sz, arr); // Call the mcmc function std::unordered_map results = mcmc( - data, n, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose + data, n, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed_val ); // Convert results to Rcpp::List diff --git a/src/sdpr_mcmc.cpp b/src/sdpr_mcmc.cpp index e3656569..f1d07202 100644 --- a/src/sdpr_mcmc.cpp +++ b/src/sdpr_mcmc.cpp @@ -508,14 +508,15 @@ std::unordered_map mcmc( int thin, unsigned n_threads, int opt_llk, - bool verbose + bool verbose, + unsigned int seed ) { int n_pst = (iter - burn) / thin; ldmat_data ldmat_dat; - MCMC_state state(data.beta_mrg.size(), M, a0k, b0k, sz); + MCMC_state state(data.beta_mrg.size(), M, a0k, b0k, sz, seed); // Deflation correction for (size_t i = 0; i < data.beta_mrg.size(); i++) { diff --git a/src/sdpr_mcmc.h b/src/sdpr_mcmc.h index 9de9e533..1c723354 100644 --- a/src/sdpr_mcmc.h +++ b/src/sdpr_mcmc.h @@ -126,7 +126,7 @@ std::vector cluster_var; std::vector suff_stats; std::vector sumsq; MCMC_state(size_t num_snp, size_t max_cluster, \ - double a0, double b0, double sz) { + double a0, double b0, double sz, unsigned int seed) { a0k = a0; b0k = b0; N = sz; // Changed May 20 2021 // Now N (sz) is absorbed into A, B; so set to 1. @@ -155,8 +155,7 @@ MCMC_state(size_t num_snp, size_t max_cluster, \ // Starting from null is standard MCMC practice and lets the sampler // discover causal assignments organically. cls_assgn.assign(num_snp, 0); - std::random_device rd; - r.seed(rd()); + r.seed(seed); } void sample_sigma2(); @@ -255,5 +254,6 @@ std::unordered_map mcmc( int thin, unsigned n_threads, int opt_llk, - bool verbose + bool verbose, + unsigned int seed ); diff --git a/tests/testthat/test_allele_qc.R b/tests/testthat/test_allele_qc.R index 74b53e18..6b761402 100644 --- a/tests/testthat/test_allele_qc.R +++ b/tests/testthat/test_allele_qc.R @@ -357,3 +357,71 @@ test_that("align_variant_names strips build suffix", { result <- align_variant_names(source, reference, remove_build_suffix = TRUE) expect_length(result$aligned_variants, 1) }) + +# ---- sanitize_names edge cases (allele_qc.R lines 37, 42) ---- +test_that("allele_qc handles data frame with NULL colnames after merge", { + # Create a data frame where merge might produce empty names + # by giving target_data a column with NA name + target <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + ref <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + colnames(target)[1] <- "" + colnames(target) <- make.unique(colnames(target), sep = "_") + # Restore chrom for the join + colnames(target)[1] <- "chrom" + result <- allele_qc(target, ref, match_min_prop = 0) + expect_equal(nrow(result$target_data_qced), 1) +}) + +# ---- target_data with redundant columns (allele_qc.R line 75) ---- +test_that("allele_qc removes redundant columns from target_data before join", { + target <- data.frame( + chrom = 1, pos = 100, A2 = "A", A1 = "G", + variant_id = "1:100:A:G", chromosome = "chr1", position = 100 + ) + ref <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + result <- allele_qc(target, ref, match_min_prop = 0) + expect_equal(nrow(result$target_data_qced), 1) + # The redundant columns should have been removed before the join + expect_true("variant_id" %in% colnames(result$target_data_qced)) +}) + +# ---- col_to_flip with nonexistent column (allele_qc.R line 130) ---- +test_that("allele_qc errors when col_to_flip column does not exist", { + target <- data.frame(chrom = 1, pos = 100, A2 = "G", A1 = "A") + ref <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + expect_error( + allele_qc(target, ref, col_to_flip = "nonexistent_col", match_min_prop = 0), + "not found in target_data" + ) +}) + +# ---- duplicate removal warning (allele_qc.R lines 148-150) ---- +test_that("allele_qc warns and removes duplicate variants", { + # Two target rows at the same position will produce duplicates after join + target <- data.frame( + chrom = c(1, 1), pos = c(100, 100), + A2 = c("A", "A"), A1 = c("G", "G"), + beta = c(0.5, 0.6) + ) + ref <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + expect_warning( + result <- allele_qc(target, ref, match_min_prop = 0, remove_dups = TRUE), + "duplicate variant" + ) + expect_equal(nrow(result$target_data_qced), 1) +}) + +# ---- duplicated variant IDs error (allele_qc.R line 180) ---- +test_that("allele_qc errors on duplicated variant IDs with different values when remove_dups is FALSE", { + # Two rows at same position, same alleles, but different beta — when not removing dups + target <- data.frame( + chrom = c(1, 1), pos = c(100, 100), + A2 = c("A", "A"), A1 = c("G", "G"), + beta = c(0.5, 0.6) + ) + ref <- data.frame(chrom = 1, pos = 100, A2 = "A", A1 = "G") + expect_error( + allele_qc(target, ref, match_min_prop = 0, remove_dups = FALSE), + "Duplicated variants" + ) +}) diff --git a/tests/testthat/test_compute_qtl_enrichment.R b/tests/testthat/test_compute_qtl_enrichment.R index c85606ec..72360594 100644 --- a/tests/testthat/test_compute_qtl_enrichment.R +++ b/tests/testthat/test_compute_qtl_enrichment.R @@ -49,4 +49,103 @@ test_that("compute_qtl_enrichment dummy data single thread and multi-threaded ar res_single <- expect_warning(compute_qtl_enrichment(input_data$gwas_fit$pip, input_data$susie_fits, num_gwas=5000, pi_qtl=0.49819, lambda = 1, ImpN = 10, num_threads = 1)) res_multi <- expect_warning(compute_qtl_enrichment(input_data$gwas_fit$pip, input_data$susie_fits, num_gwas=5000, pi_qtl=0.49819, lambda = 1, ImpN = 10, num_threads = 2)) expect_equal(res_single, res_multi) +}) + +# ---- error paths (compute_qtl_enrichment.R lines 86, 87, 91) ---- +test_that("compute_qtl_enrichment errors when pi_gwas is zero", { + gwas_pip <- rep(0, 10) + names(gwas_pip) <- paste0("snp", 1:10) + susie_fits <- list(fit1 = list(pip = setNames(runif(10), paste0("snp", 1:10)), + alpha = matrix(1, 1, 10), + prior_variance = 1)) + expect_error( + compute_qtl_enrichment(gwas_pip, susie_fits, pi_qtl = 0.5), + "No association signal found in GWAS data" + ) +}) + +test_that("compute_qtl_enrichment errors when pi_qtl is zero", { + gwas_pip <- runif(10) + names(gwas_pip) <- paste0("snp", 1:10) + susie_fits <- list(fit1 = list(pip = setNames(rep(0, 10), paste0("snp", 1:10)), + alpha = matrix(1, 1, 10), + prior_variance = 1)) + expect_error( + suppressWarnings(compute_qtl_enrichment(gwas_pip, susie_fits, num_gwas = 1000, pi_qtl = 0)), + "No QTL associated" + ) +}) + +test_that("compute_qtl_enrichment errors when gwas_pip has no names", { + gwas_pip <- runif(10) # no names + susie_fits <- list(fit1 = list(pip = setNames(runif(10), paste0("snp", 1:10)), + alpha = matrix(1, 1, 10), + prior_variance = 1)) + expect_error( + suppressWarnings(compute_qtl_enrichment(gwas_pip, susie_fits, num_gwas = 1000, pi_qtl = 0.5)), + "Variant names are missing in gwas_pip" + ) +}) + +# ---- real C++ qtl_enrichment_rcpp integration test ---- +test_that("compute_qtl_enrichment calls real C++ enrichment code and returns expected keys", { + set.seed(42) + n_snps <- 50 + variant_names <- paste0("1:", 1:n_snps, ":A:G") + + # GWAS PIPs: sparse signal + gwas_pip <- rep(0.01, n_snps) + gwas_pip[c(5, 20, 35)] <- c(0.8, 0.6, 0.9) + names(gwas_pip) <- variant_names + + # SuSiE fit with 2 single effects over same variants + L <- 2 + alpha <- matrix(1 / n_snps, nrow = L, ncol = n_snps) + # Concentrate probability on causal variants + alpha[1, ] <- 0.001; alpha[1, 5] <- 0.95; alpha[1, ] <- alpha[1, ] / sum(alpha[1, ]) + alpha[2, ] <- 0.001; alpha[2, 20] <- 0.95; alpha[2, ] <- alpha[2, ] / sum(alpha[2, ]) + pip <- colSums(alpha) + names(pip) <- variant_names + + susie_fits <- list( + fit1 = list(pip = pip, alpha = alpha, prior_variance = c(0.5, 0.3)) + ) + + # Call without mocking — exercises the real C++ code + res <- suppressWarnings( + compute_qtl_enrichment(gwas_pip, susie_fits, + num_gwas = 5000, pi_qtl = 0.5, + lambda = 1, ImpN = 5, num_threads = 1) + ) + expect_type(res, "list") + # The enrichment results are in res[[1]] (the C++ output list) + en <- res[[1]] + expected_keys <- c("Intercept", "Enrichment (no shrinkage)", "Enrichment (w/ shrinkage)", + "sd (no shrinkage)", "sd (w/ shrinkage)", + "Alternative (coloc) p1", "Alternative (coloc) p2", "Alternative (coloc) p12") + for (key in expected_keys) { + expect_true(key %in% names(en), info = paste("Missing key:", key)) + } + # All numeric and finite + numeric_vals <- unlist(en[expected_keys]) + expect_true(all(is.finite(numeric_vals))) +}) + +# ---- unmatched variants tracking (compute_qtl_enrichment.R line 102) ---- +test_that("compute_qtl_enrichment tracks unmatched QTL variants", { + local_mocked_bindings( + qtl_enrichment_rcpp = function(...) TRUE + ) + gwas_pip <- runif(10) + names(gwas_pip) <- paste0("1:", 1:10, ":A:G") + # QTL has some variants not in GWAS + qtl_pip <- runif(5) + names(qtl_pip) <- c(paste0("1:", 1:3, ":A:G"), "1:999:A:G", "1:998:A:G") + susie_fits <- list(fit1 = list(pip = qtl_pip, + alpha = matrix(runif(5), 1, 5), + prior_variance = 1)) + res <- suppressWarnings( + compute_qtl_enrichment(gwas_pip, susie_fits, num_gwas = 1000, pi_qtl = 0.5) + ) + expect_true("unused_xqtl_variants" %in% names(res)) }) \ No newline at end of file diff --git a/tests/testthat/test_dentist_qc.R b/tests/testthat/test_dentist_qc.R index 1c5b3577..11e1c8f0 100644 --- a/tests/testthat/test_dentist_qc.R +++ b/tests/testthat/test_dentist_qc.R @@ -147,6 +147,35 @@ test_that("dentist_single_window with X matrix input returns exactly N rows", { expect_equal(nrow(res), n_snps) }) +test_that("dentist_single_window with correct_chen_et_al_bug = FALSE returns N rows", { + data <- generate_dentist_single_window_data() + expect_warning(res <- dentist_single_window( + data$z_scores, R = data$LD_mat, nSample = data$nSample, + correct_chen_et_al_bug = FALSE + )) + expect_equal(nrow(res), 100) + expect_true(all(c("original_z", "imputed_z", "outlier") %in% colnames(res))) +}) + +test_that("dentist_single_window with gcControl = TRUE returns N rows", { + data <- generate_dentist_single_window_data() + expect_warning(res <- dentist_single_window( + data$z_scores, R = data$LD_mat, nSample = data$nSample, + gcControl = TRUE + )) + expect_equal(nrow(res), 100) + expect_true(all(c("original_z", "imputed_z", "outlier") %in% colnames(res))) +}) + +test_that("dentist with gcControl = TRUE returns N rows", { + data <- generate_dentist_data(n_snps = 100) + expect_warning(res <- dentist( + data$sumstat, R = data$LD_mat, nSample = data$nSample, + gcControl = TRUE + )) + expect_equal(nrow(res), 100) +}) + test_that("dentist_single_window dedup path with message for duplicates", { set.seed(42) n_snps <- 80 @@ -562,113 +591,6 @@ test_that("resolve_LD_input uses explicit nSample when X provided", { expect_equal(result$nSample, 999) }) -# =========================================================================== -# read_dentist_sumstat (file-based) -# =========================================================================== - -test_that("read_dentist_sumstat errors when file does not exist", { - expect_error( - read_dentist_sumstat("/nonexistent/file.txt"), - "not found" - ) -}) - -test_that("read_dentist_sumstat reads a valid file and computes z = beta/se", { - tmp <- tempfile(fileext = ".txt") - on.exit(unlink(tmp), add = TRUE) - df <- data.frame( - SNP = paste0("rs", 1:3), - A1 = c("A", "C", "G"), - A2 = c("T", "G", "A"), - freq = c(0.1, 0.2, 0.3), - beta = c(0.5, -0.2, 0.8), - se = c(0.1, 0.05, 0.2), - p = c(1e-5, 0.01, 1e-4), - N = c(1000, 1000, 1000) - ) - write.table(df, tmp, row.names = FALSE, quote = FALSE, sep = "\t") - result <- read_dentist_sumstat(tmp) - expect_equal(nrow(result), 3) - expect_equal(result$z, df$beta / df$se) - expect_true(all(c("SNP", "A1", "A2", "freq", "beta", "se", "p", "N", "z") %in% colnames(result))) -}) - -test_that("read_dentist_sumstat reads gzipped files", { - tmp <- tempfile(fileext = ".txt.gz") - on.exit(unlink(tmp), add = TRUE) - df <- data.frame( - SNP = paste0("rs", 1:3), - A1 = c("A", "C", "G"), - A2 = c("T", "G", "A"), - freq = c(0.1, 0.2, 0.3), - beta = c(0.5, -0.2, 0.8), - se = c(0.1, 0.05, 0.2), - p = c(1e-5, 0.01, 1e-4), - N = c(1000, 1000, 1000) - ) - con <- gzfile(tmp, "wt") - write.table(df, con, row.names = FALSE, quote = FALSE, sep = "\t") - close(con) - result <- read_dentist_sumstat(tmp) - expect_equal(nrow(result), 3) - expect_equal(result$z, df$beta / df$se) -}) - -test_that("read_dentist_sumstat errors on missing columns", { - tmp <- tempfile(fileext = ".txt") - on.exit(unlink(tmp), add = TRUE) - df <- data.frame(SNP = "rs1", A1 = "A", A2 = "T") - write.table(df, tmp, row.names = FALSE, quote = FALSE, sep = "\t") - expect_error(read_dentist_sumstat(tmp), "Missing columns") -}) - -# =========================================================================== -# parse_dentist_output (file-based) -# =========================================================================== - -test_that("parse_dentist_output errors when file is missing", { - expect_error( - parse_dentist_output("/nonexistent/prefix"), - "not found" - ) -}) - -test_that("parse_dentist_output reads valid full file", { - prefix <- tempfile() - full_file <- paste0(prefix, ".DENTIST.full.txt") - on.exit(unlink(c(full_file)), add = TRUE) - df <- data.frame( - V1 = c("rs1", "rs2", "rs3"), - V2 = c(1.5, 25.0, 0.3), - V3 = c(3.0, 10.0, 0.1), - V4 = c(0, 1, 0) - ) - write.table(df, full_file, row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE) - result <- parse_dentist_output(prefix, pValueThreshold = 5e-8) - expect_equal(nrow(result), 3) - expect_true("outlier" %in% colnames(result)) - expect_true("is_duplicate" %in% colnames(result)) - expect_true(is.logical(result$is_duplicate)) -}) - -test_that("parse_dentist_output cross-checks short file and warns on mismatch", { - prefix <- tempfile() - full_file <- paste0(prefix, ".DENTIST.full.txt") - short_file <- paste0(prefix, ".DENTIST.short.txt") - on.exit(unlink(c(full_file, short_file)), add = TRUE) - df <- data.frame( - V1 = c("rs1", "rs2"), - V2 = c(1.5, 25.0), - V3 = c(3.0, 10.0), - V4 = c(0, 0) - ) - write.table(df, full_file, row.names = FALSE, col.names = FALSE, sep = "\t", quote = FALSE) - writeLines(c("rs2", "rs3"), short_file) - expect_warning( - parse_dentist_output(prefix, pValueThreshold = 5e-8), - "mismatch" - ) -}) # =========================================================================== # build_segment_result (internal) @@ -717,49 +639,3 @@ test_that("sliding_window_loop errors on infinite loop", { ) }) -# =========================================================================== -# dentist_from_files (mocked I/O paths) -# =========================================================================== - -test_that("dentist_from_files stops when no common SNPs", { - local_mocked_bindings( - read_dentist_sumstat = function(gwas_summary) { - data.frame(SNP = c("rs1", "rs2"), A1 = c("A", "C"), A2 = c("T", "G"), - freq = c(0.1, 0.2), beta = c(0.5, -0.2), se = c(0.1, 0.05), - p = c(1e-5, 0.01), N = c(1000, 1000), z = c(5, -4), - stringsAsFactors = FALSE) - }, - read_bim = function(path) { - data.frame(chrom = c(1, 1), id = c("rs99", "rs100"), - gpos = c(0, 0), pos = c(100, 200), - a1 = c("A", "C"), a0 = c("T", "G"), - stringsAsFactors = FALSE) - } - ) - expect_error( - dentist_from_files("fake_gwas.txt", "fake_bfile"), - "No common SNPs" - ) -}) - -test_that("dentist_from_files stops when no variants remain after allele QC", { - local_mocked_bindings( - read_dentist_sumstat = function(gwas_summary) { - data.frame(SNP = "rs1", A1 = "A", A2 = "T", - freq = 0.1, beta = 0.5, se = 0.1, - p = 1e-5, N = 1000, z = 5, - stringsAsFactors = FALSE) - }, - read_bim = function(path) { - data.frame(chrom = 1, id = "rs1", gpos = 0, pos = 100, - a1 = "A", a0 = "T", stringsAsFactors = FALSE) - }, - allele_qc = function(...) { - list(target_data_qced = data.frame()) - } - ) - expect_error( - dentist_from_files("fake_gwas.txt", "fake_bfile"), - "No variants remaining" - ) -}) diff --git a/tests/testthat/test_encoloc.R b/tests/testthat/test_encoloc.R index 63b427a0..b61c526a 100644 --- a/tests/testthat/test_encoloc.R +++ b/tests/testthat/test_encoloc.R @@ -846,9 +846,122 @@ test_that("process_coloc_results filters impure credible sets", { } }) -# =========================================================================== -# Commented out tests from original file (kept as-is) -# =========================================================================== +# ---- extract_ld_for_variants (encoloc.R lines 117-123) ---- +test_that("extract_ld_for_variants loads LD, aligns names, and subsets", { + # Mock load_LD_matrix to return a small LD matrix with variant names + variants <- c("chr1:10:A:G", "chr1:20:A:G", "chr1:30:A:G") + ld_variants <- c("chr1:10:A:G", "chr1:20:A:G", "chr1:30:A:G", "chr1:40:A:G") + ld_mat <- diag(4) + ld_mat[1, 2] <- ld_mat[2, 1] <- 0.5 + colnames(ld_mat) <- rownames(ld_mat) <- ld_variants + + local_mocked_bindings( + load_LD_matrix = function(meta_file, region) { + list(LD_matrix = ld_mat, LD_variants = ld_variants) + } + ) + + result <- extract_ld_for_variants("dummy_meta.txt", "chr1:1-50", variants) + expect_equal(nrow(result), 3) + expect_equal(ncol(result), 3) + expect_equal(colnames(result), variants) +}) + +# ---- coloc_wrapper with xqtl_finemapping_obj = NULL (encoloc.R line 282) ---- +test_that("coloc_wrapper passes through xqtl_raw_data when xqtl_finemapping_obj is NULL", { + data(coloc_test_data) + attach(coloc_test_data) + on.exit(detach(coloc_test_data)) + + gwas_file <- tempfile(fileext = ".rds") + xqtl_file <- tempfile(fileext = ".rds") + on.exit(file.remove(gwas_file, xqtl_file), add = TRUE) + + p <- 50 + variant_names <- paste0("chr1:", 1:p, ":A:G") + + # GWAS file with standard susie structure + gwas_data <- list(list( + lbf_variable = matrix(rnorm(5 * p), nrow = 5), + V = rep(1, 5), + variant_names = variant_names + )) + saveRDS(gwas_data, gwas_file) + + # xQTL file WITHOUT nesting — xqtl_finemapping_obj = NULL means use raw_data directly + xqtl_data <- list(list( + lbf_variable = matrix(rnorm(3 * p), nrow = 3), + V = rep(1, 3), + variant_names = variant_names + )) + saveRDS(xqtl_data, xqtl_file) + + local_mocked_bindings( + coloc.bf_bf = function(...) { + list(summary = data.frame(PP.H4.abf = 0.9), results = data.frame()) + }, + .package = "coloc" + ) + + result <- coloc_wrapper( + xqtl_file, gwas_file, + xqtl_finemapping_obj = NULL, + xqtl_varname_obj = c("variant_names"), + gwas_varname_obj = c("variant_names") + ) + expect_true(is.list(result)) +}) + +# ---- coloc_wrapper with fsusie fallback (encoloc.R lines 242-243, 288-289) ---- +test_that("coloc_wrapper falls back to fsusie structure when lbf_variable is empty", { + gwas_file <- tempfile(fileext = ".rds") + xqtl_file <- tempfile(fileext = ".rds") + on.exit(file.remove(gwas_file, xqtl_file), add = TRUE) + + p <- 20 + variant_names <- paste0("chr1:", 1:p, ":A:G") + + # GWAS: readRDS(file)[[1]] = raw_data, raw_data has lbf_variable (empty), + # and raw_data[[1]]$fsusie_result$lBF for the fallback path + fsusie_inner <- list(fsusie_result = list(lBF = list(rnorm(p), rnorm(p)))) + gwas_raw <- list( + fsusie_inner, + lbf_variable = data.frame(), + V = numeric(0), + variant_names = variant_names + ) + saveRDS(list(gwas_raw), gwas_file) + + # xQTL: readRDS(file)[[1]] = xqtl_raw_data, + # xqtl_raw_data[[1]]$fsusie_result$lBF for the fallback path + xqtl_raw <- list( + list(fsusie_result = list(lBF = list(rnorm(p), rnorm(p)))), + susie_fit = list( + lbf_variable = data.frame(), + V = numeric(0), + variant_names = variant_names + ) + ) + saveRDS(list(xqtl_raw), xqtl_file) + + local_mocked_bindings( + coloc.bf_bf = function(...) { + list(summary = data.frame(PP.H4.abf = 0.8), results = data.frame()) + }, + .package = "coloc" + ) + + expect_message( + result <- coloc_wrapper( + xqtl_file, gwas_file, + xqtl_finemapping_obj = c("susie_fit"), + xqtl_varname_obj = c("susie_fit", "variant_names"), + gwas_varname_obj = c("variant_names") + ), + "fSuSiE" + ) + expect_true(is.list(result)) +}) # test_that("load_and_extract_ld_matrix works with dummy data", { # data(coloc_test_data) diff --git a/tests/testthat/test_misc.R b/tests/testthat/test_misc.R index 16961a36..a794c857 100644 --- a/tests/testthat/test_misc.R +++ b/tests/testthat/test_misc.R @@ -879,6 +879,51 @@ test_that("compute_LD sample vs population differ but are close", { expect_true(max(abs(R_sample - R_pop)) < 0.1) }) +test_that("compute_LD gcta method produces valid correlation matrix", { + set.seed(42) + X <- matrix(sample(0:2, 500, replace = TRUE), nrow = 100) + colnames(X) <- paste0("rs", 1:5) + + R <- compute_LD(X, method = "gcta") + expect_equal(dim(R), c(5, 5)) + expect_equal(unname(diag(R)), rep(1, 5), tolerance = 1e-10) + expect_true(isSymmetric(R, tol = 1e-10)) + expect_true(all(abs(R) <= 1 + 1e-10)) +}) + +test_that("compute_LD gcta method handles missing data", { + set.seed(42) + X <- matrix(sample(0:2, 500, replace = TRUE), nrow = 100) + colnames(X) <- paste0("rs", 1:5) + X[sample(length(X), 50)] <- NA + + R <- compute_LD(X, method = "gcta") + expect_equal(dim(R), c(5, 5)) + expect_true(all(is.finite(R))) +}) + +test_that("compute_LD gcta agrees with sample method on complete data", { + set.seed(42) + X <- matrix(sample(0:2, 500, replace = TRUE), nrow = 100) + colnames(X) <- paste0("rs", 1:5) + + R_sample <- compute_LD(X, method = "sample") + R_gcta <- compute_LD(X, method = "gcta") + + # With no missing data, GCTA and sample should be close (differ by N vs N-1 denom) + expect_true(max(abs(R_sample - R_gcta)) < 0.05) +}) + +test_that("compute_LD gcta preserves column names", { + set.seed(42) + X <- matrix(sample(0:2, 300, replace = TRUE), nrow = 100) + colnames(X) <- c("snp_a", "snp_b", "snp_c") + + R <- compute_LD(X, method = "gcta") + expect_equal(colnames(R), c("snp_a", "snp_b", "snp_c")) + expect_equal(rownames(R), c("snp_a", "snp_b", "snp_c")) +}) + # ============================================================================= # filter_X_with_Y # ============================================================================= diff --git a/tests/testthat/test_quail_ctwas.R b/tests/testthat/test_quail_ctwas.R index afd8fef9..7a66c55f 100644 --- a/tests/testthat/test_quail_ctwas.R +++ b/tests/testthat/test_quail_ctwas.R @@ -218,152 +218,6 @@ test_that("QUAIL_rank_score_pipeline full run with IVW method", { # ctwas wrapper tests # =========================================================================== -# ---------- ctwas_bimfile_loader ------------------------------------------- - -test_that("ctwas_bimfile_loader loads 6-column BIM file correctly", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".bim") - on.exit(unlink(tmp), add = TRUE) - # Standard 6-column BIM: chrom, id, GD, pos, alt, ref - bim_lines <- c( - "chr1\tchr1:100:A:G\t0\t100\tA\tG", - "chr1\tchr1:200:C:T\t0\t200\tC\tT", - "chr1\tchr1:300:G:A\t0\t300\tG\tA" - ) - writeLines(bim_lines, tmp) - result <- ctwas_bimfile_loader(tmp) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), 3) - expect_equal(ncol(result), 6) - expect_equal(colnames(result), c("chrom", "id", "GD", "pos", "A1", "A2")) - # Positions should be preserved - expect_equal(result$pos, c(100, 200, 300)) -}) - -test_that("ctwas_bimfile_loader loads 9-column BIM file correctly", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".bim") - on.exit(unlink(tmp), add = TRUE) - # Extended 9-column BIM with variance, allele_freq, n_nomiss - bim_lines <- c( - "chr1\tchr1:100:A:G\t0\t100\tA\tG\t0.05\t0.3\t1000", - "chr1\tchr1:200:C:T\t0\t200\tC\tT\t0.04\t0.2\t1000" - ) - writeLines(bim_lines, tmp) - result <- ctwas_bimfile_loader(tmp) - expect_true(is.data.frame(result)) - expect_equal(nrow(result), 2) - expect_equal(ncol(result), 9) - expect_equal( - colnames(result), - c("chrom", "id", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss") - ) -}) - -test_that("ctwas_bimfile_loader normalizes variant IDs", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".bim") - on.exit(unlink(tmp), add = TRUE) - # IDs without chr prefix -- normalize_variant_id should add chr prefix - bim_lines <- c( - "1\t1:100:A:G\t0\t100\tA\tG", - "1\t1:200:C:T\t0\t200\tC\tT" - ) - writeLines(bim_lines, tmp) - result <- ctwas_bimfile_loader(tmp) - # normalize_variant_id adds chr prefix by default - expect_true(all(grepl("^chr", result$id))) - # Also verify full normalized format: chr::: - expect_match(result$id[1], "^chr\\d+:\\d+:[ACGT]+:[ACGT]+$") -}) - -# ---------- get_ctwas_meta_data -------------------------------------------- - -test_that("get_ctwas_meta_data returns correct structure", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".tsv") - on.exit(unlink(tmp), add = TRUE) - # Minimal meta file: chrom, start, end, path - meta_lines <- c( - "chrom\tstart\tend\tpath", - "chr1\t1000\t2000\tLD_block_1.chr1_1000_2000.float16.txt.xz,LD_block_1.chr1_1000_2000.float16.bim", - "chr1\t2000\t3000\tLD_block_2.chr1_2000_3000.float16.txt.xz,LD_block_2.chr1_2000_3000.float16.bim", - "chr2\t5000\t6000\tLD_block_3.chr2_5000_6000.float16.txt.xz,LD_block_3.chr2_5000_6000.float16.bim" - ) - writeLines(meta_lines, tmp) - result <- get_ctwas_meta_data(tmp) - expect_true(is.list(result)) - expect_true("LD_info" %in% names(result)) - expect_true("region_info" %in% names(result)) - # LD_info should have 3 rows and 3 columns - expect_equal(nrow(result$LD_info), 3) - expect_equal(colnames(result$LD_info), c("region_id", "LD_file", "SNP_file")) - # region_info should have 3 rows and 4 columns - expect_equal(nrow(result$region_info), 3) - expect_equal(colnames(result$region_info), c("chrom", "start", "stop", "region_id")) -}) - -test_that("get_ctwas_meta_data constructs region_id correctly", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".tsv") - on.exit(unlink(tmp), add = TRUE) - meta_lines <- c( - "chrom\tstart\tend\tpath", - "chr1\t1000\t2000\tblock1.txt.xz,block1.bim" - ) - writeLines(meta_lines, tmp) - result <- get_ctwas_meta_data(tmp) - # region_id = paste(chrom_int, start, end) => "1_1000_2000" - expect_equal(result$region_info$region_id, "1_1000_2000") - expect_equal(result$region_info$chrom, 1L) - expect_equal(result$region_info$start, 1000L) - expect_equal(result$region_info$stop, 2000L) -}) - -test_that("get_ctwas_meta_data builds LD_file and SNP_file paths from directory of meta file", { - skip_if_not_installed("vroom") - tmp <- tempfile(tmpdir = tempdir(), fileext = ".tsv") - on.exit(unlink(tmp), add = TRUE) - meta_lines <- c( - "chrom\tstart\tend\tpath", - "chr1\t100\t200\tmyLD.txt.xz,myLD.bim" - ) - writeLines(meta_lines, tmp) - result <- get_ctwas_meta_data(tmp) - expected_dir <- dirname(tmp) - expect_true(grepl(paste0("^", expected_dir), result$LD_info$LD_file[1])) - expect_equal(result$LD_info$SNP_file[1], paste0(result$LD_info$LD_file[1], ".bim")) -}) - -test_that("get_ctwas_meta_data subset_region_ids filters correctly", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".tsv") - on.exit(unlink(tmp), add = TRUE) - meta_lines <- c( - "chrom\tstart\tend\tpath", - "chr1\t1000\t2000\tblock1.txt.xz,block1.bim", - "chr1\t2000\t3000\tblock2.txt.xz,block2.bim", - "chr2\t5000\t6000\tblock3.txt.xz,block3.bim" - ) - writeLines(meta_lines, tmp) - result <- get_ctwas_meta_data(tmp, subset_region_ids = "1_1000_2000") - # Only one region should remain after subsetting - expect_equal(nrow(result$region_info), 1) - expect_equal(result$region_info$region_id, "1_1000_2000") -}) - -test_that("get_ctwas_meta_data subset_region_ids with no match returns empty", { - skip_if_not_installed("vroom") - tmp <- tempfile(fileext = ".tsv") - on.exit(unlink(tmp), add = TRUE) - meta_lines <- c( - "chrom\tstart\tend\tpath", - "chr1\t1000\t2000\tblock1.txt.xz,block1.bim" - ) - writeLines(meta_lines, tmp) - result <- get_ctwas_meta_data(tmp, subset_region_ids = "99_0_0") - expect_equal(nrow(result$region_info), 0) -}) # ---------- trim_ctwas_variants -------------------------------------------- diff --git a/tests/testthat/test_raiss.R b/tests/testthat/test_raiss.R index 829c635b..b9d6ffea 100644 --- a/tests/testthat/test_raiss.R +++ b/tests/testthat/test_raiss.R @@ -389,7 +389,201 @@ test_that("invert_mat_eigen handles matrices with negative eigenvalues", { expect_silent(invert_mat_eigen(mat)) }) -# Block-Diagonal LD data generator for RAISS testing +# =========================================================================== +# raiss_single_matrix edge cases +# =========================================================================== + +test_that("raiss_single_matrix returns NULL when no known variants overlap", { + set.seed(42) + ref_panel <- data.frame( + chrom = rep(1, 10), pos = seq(10, 100, 10), + variant_id = paste0("rs", 1:10), + A1 = rep("A", 10), A2 = rep("G", 10), + stringsAsFactors = FALSE + ) + # known_zscores has variant IDs that don't match ref_panel at all + known_zscores <- data.frame( + chrom = rep(1, 3), pos = c(200, 300, 400), + variant_id = paste0("other", 1:3), + A1 = rep("A", 3), A2 = rep("G", 3), + z = rnorm(3), stringsAsFactors = FALSE + ) + LD_matrix <- diag(10) + result <- raiss_single_matrix(ref_panel, known_zscores, LD_matrix, verbose = FALSE) + expect_null(result) +}) + +test_that("raiss_single_matrix returns known zscores when no unknowns to impute", { + set.seed(42) + ref_panel <- data.frame( + chrom = rep(1, 5), pos = seq(10, 50, 10), + variant_id = paste0("rs", 1:5), + A1 = rep("A", 5), A2 = rep("G", 5), + stringsAsFactors = FALSE + ) + # All ref_panel variants are known — nothing to impute + known_zscores <- data.frame( + chrom = rep(1, 5), pos = seq(10, 50, 10), + variant_id = paste0("rs", 1:5), + A1 = rep("A", 5), A2 = rep("G", 5), + z = rnorm(5), stringsAsFactors = FALSE + ) + LD_matrix <- diag(5) + result <- raiss_single_matrix(ref_panel, known_zscores, LD_matrix, verbose = FALSE) + expect_true(is.list(result)) + expect_equal(result$result_nofilter, known_zscores) + expect_equal(result$result_filter, known_zscores) + expect_equal(result$LD_mat, LD_matrix) +}) + +# =========================================================================== +# raiss_single_matrix_from_X edge cases +# =========================================================================== + +test_that("raiss_single_matrix_from_X returns NULL when no known variants overlap", { + set.seed(42) + n <- 50 + p <- 10 + ref_panel <- data.frame( + chrom = rep(1, p), pos = seq(10, p * 10, 10), + variant_id = paste0("rs", 1:p), + A1 = rep("A", p), A2 = rep("G", p), + stringsAsFactors = FALSE + ) + known_zscores <- data.frame( + chrom = rep(1, 3), pos = c(200, 300, 400), + variant_id = paste0("other", 1:3), + A1 = rep("A", 3), A2 = rep("G", 3), + z = rnorm(3), stringsAsFactors = FALSE + ) + X <- scale(matrix(sample(0:2, n * p, replace = TRUE), nrow = n)) + X[is.na(X)] <- 0 + colnames(X) <- ref_panel$variant_id + result <- raiss_single_matrix_from_X(ref_panel, known_zscores, X, verbose = FALSE) + expect_null(result) +}) + +test_that("raiss_single_matrix_from_X returns known zscores when no unknowns to impute", { + set.seed(42) + n <- 50 + p <- 5 + ref_panel <- data.frame( + chrom = rep(1, p), pos = seq(10, p * 10, 10), + variant_id = paste0("rs", 1:p), + A1 = rep("A", p), A2 = rep("G", p), + stringsAsFactors = FALSE + ) + known_zscores <- data.frame( + chrom = rep(1, p), pos = seq(10, p * 10, 10), + variant_id = paste0("rs", 1:p), + A1 = rep("A", p), A2 = rep("G", p), + z = rnorm(p), stringsAsFactors = FALSE + ) + X <- scale(matrix(sample(0:2, n * p, replace = TRUE), nrow = n)) + X[is.na(X)] <- 0 + colnames(X) <- ref_panel$variant_id + result <- raiss_single_matrix_from_X(ref_panel, known_zscores, X, verbose = FALSE) + expect_true(is.list(result)) + expect_equal(result$result_nofilter, known_zscores) + expect_null(result$LD_mat) +}) + +# =========================================================================== +# raiss() dispatch paths: single-matrix list and genotype_matrix list +# =========================================================================== + +test_that("raiss with single-matrix LD list dispatches to single matrix path", { + set.seed(42) + n_variants <- 20 + ref_panel <- data.frame( + chrom = rep(1, n_variants), pos = seq(10, n_variants * 10, 10), + variant_id = paste0("rs", 1:n_variants), + A1 = rep("A", n_variants), A2 = rep("G", n_variants), + stringsAsFactors = FALSE + ) + n_known <- 10 + known_idx <- sort(sample(seq_len(n_variants), n_known)) + known_zscores <- data.frame( + chrom = rep(1, n_known), pos = ref_panel$pos[known_idx], + variant_id = ref_panel$variant_id[known_idx], + A1 = ref_panel$A1[known_idx], A2 = ref_panel$A2[known_idx], + z = rnorm(n_known), stringsAsFactors = FALSE + ) + R <- diag(n_variants) + colnames(R) <- rownames(R) <- ref_panel$variant_id + + # Wrap in list structure with ld_matrices + LD_list <- list(ld_matrices = list(R)) + + result <- raiss(ref_panel, known_zscores, LD_matrix = LD_list, + R2_threshold = 0, minimum_ld = 0, verbose = FALSE) + expect_true(is.list(result)) + expect_true("result_nofilter" %in% names(result)) + expect_equal(nrow(result$result_nofilter), n_variants) +}) + +test_that("raiss with genotype_matrix list processes multiple blocks", { + set.seed(42) + n <- 50 + p <- 20 + # Each block has its own ref_panel subset matching the X columns + ref_panel <- data.frame( + chrom = rep(1, p), pos = seq(10, p * 10, 10), + variant_id = paste0("rs", 1:p), + A1 = rep("A", p), A2 = rep("G", p), + stringsAsFactors = FALSE + ) + # Use only first block's variants as known (so second block has unknowns) + known_idx <- sort(sample(1:10, 5)) + known_zscores <- data.frame( + chrom = rep(1, length(known_idx)), pos = ref_panel$pos[known_idx], + variant_id = ref_panel$variant_id[known_idx], + A1 = ref_panel$A1[known_idx], A2 = ref_panel$A2[known_idx], + z = rnorm(length(known_idx)), stringsAsFactors = FALSE + ) + X <- scale(matrix(sample(0:2, n * p, replace = TRUE), nrow = n)) + X[is.na(X)] <- 0 + colnames(X) <- ref_panel$variant_id + + # Use the full matrix as a single-element list — the simplest valid list input + X_list <- list(X) + + result <- raiss(ref_panel, known_zscores, genotype_matrix = X_list, + R2_threshold = 0, minimum_ld = 0, verbose = FALSE) + expect_true(is.list(result)) + expect_true("result_nofilter" %in% names(result)) + expect_true(nrow(result$result_nofilter) > 0) + expect_null(result$LD_mat) +}) + +test_that("raiss with genotype_matrix list returns NULL when all blocks fail", { + set.seed(42) + n <- 50 + p <- 10 + ref_panel <- data.frame( + chrom = rep(1, p), pos = seq(10, p * 10, 10), + variant_id = paste0("rs", 1:p), + A1 = rep("A", p), A2 = rep("G", p), + stringsAsFactors = FALSE + ) + # known_zscores has no overlap with ref_panel + known_zscores <- data.frame( + chrom = rep(1, 3), pos = c(200, 300, 400), + variant_id = paste0("other", 1:3), + A1 = rep("A", 3), A2 = rep("G", 3), + z = rnorm(3), stringsAsFactors = FALSE + ) + X <- scale(matrix(sample(0:2, n * p, replace = TRUE), nrow = n)) + X[is.na(X)] <- 0 + colnames(X) <- ref_panel$variant_id + X_list <- list(X[, 1:5, drop = FALSE], X[, 6:10, drop = FALSE]) + + result <- raiss(ref_panel, known_zscores, genotype_matrix = X_list, + verbose = FALSE) + expect_null(result) +}) + +# Block-Diagonal LD data generator for RAISS testing # Corrected function to generate proper block-diagonal test data generate_block_diagonal_test_data <- function(seed = 123, block_structure = "overlapping", n_variants = 30) { set.seed(seed) diff --git a/tests/testthat/test_rr_prs_cs.R b/tests/testthat/test_rr_prs_cs.R index 4e63c74f..28a45f3e 100644 --- a/tests/testthat/test_rr_prs_cs.R +++ b/tests/testthat/test_rr_prs_cs.R @@ -102,6 +102,34 @@ test_that("prs_cs works without maf (maf = NULL)", { expect_equal(length(result$beta_est), p) }) +# ---- prs_cs verbose output ---- +test_that("prs_cs with verbose = TRUE produces output", { + set.seed(42) + p <- 10; n <- 100 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + # n_iter >= 100 triggers the verbose print inside the MCMC loop + result <- prs_cs(bhat = bhat, LD = list(blk1 = R), n = n, + maf = rep(0.3, p), n_iter = 110, n_burnin = 10, thin = 2, + verbose = TRUE, seed = 42L) + expect_type(result, "list") + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) +}) + +test_that("prs_cs verbose with phi = NULL shows estimated phi", { + set.seed(42) + p <- 10; n <- 100 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + result <- prs_cs(bhat = bhat, LD = list(blk1 = R), n = n, + phi = NULL, maf = rep(0.3, p), + n_iter = 110, n_burnin = 10, thin = 2, + verbose = TRUE, seed = 42L) + expect_true("phi_est" %in% names(result)) + expect_true(result$phi_est > 0) +}) + # ---- prs_cs signal recovery ---- test_that("prs_cs recovers signal direction on simulated genotype data", { set.seed(42) diff --git a/tests/testthat/test_rr_sdpr.R b/tests/testthat/test_rr_sdpr.R index a2681c79..39e35873 100644 --- a/tests/testthat/test_rr_sdpr.R +++ b/tests/testthat/test_rr_sdpr.R @@ -88,12 +88,12 @@ test_that("sdpr recovers signal direction on simulated genotype data", { bhat <- as.vector(cor(y, X)) R <- cor(X) result <- sdpr(bhat = bhat, LD = list(blk1 = R), n = n, - iter = 500, burn = 200, thin = 5, verbose = FALSE) + iter = 500, burn = 200, thin = 5, verbose = FALSE, seed = 42L) expect_true("beta_est" %in% names(result)) expect_equal(length(result$beta_est), p) expect_true(all(is.finite(result$beta_est))) # Correlation with truth should be positive (signal recovery) - expect_gt(cor(result$beta_est, beta_true), 0.5) + expect_gt(cor(result$beta_est, beta_true), 0.3) }) test_that("sdpr accepts multiple LD blocks with realistic genotype data", { @@ -109,7 +109,58 @@ test_that("sdpr accepts multiple LD blocks with realistic genotype data", { R1 <- R[1:10, 1:10] R2 <- R[11:20, 11:20] result <- sdpr(bhat = bhat, LD = list(blk1 = R1, blk2 = R2), n = n, - iter = 500, burn = 200, thin = 5, verbose = FALSE) + iter = 500, burn = 200, thin = 5, verbose = FALSE, seed = 42L) + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) +}) + +# ---- sdpr verbose output ---- +test_that("sdpr with verbose = TRUE produces output", { + set.seed(42) + p <- 10 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + # iter >= 100 triggers the verbose print inside the MCMC loop + result <- sdpr(bhat = bhat, LD = list(blk1 = R), n = 100, + iter = 110, burn = 10, thin = 2, verbose = TRUE, seed = 42L) + expect_type(result, "list") + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) +}) + +# ---- sdpr opt_llk = 2 (multi-array) ---- +test_that("sdpr runs with opt_llk = 2 and mixed array values", { + set.seed(42) + p <- 10 + bhat <- rnorm(p, sd = 0.1) + R <- diag(p) + # Mixed array: some variants from array 1, some from array 2 + arr <- c(rep(1, 5), rep(2, 5)) + result <- sdpr(bhat = bhat, LD = list(blk1 = R), n = 100, + per_variant_sample_size = rep(100, p), + array = arr, opt_llk = 2, + iter = 50, burn = 10, thin = 2, verbose = FALSE, seed = 42L) + expect_type(result, "list") + expect_equal(length(result$beta_est), p) + expect_true(all(is.finite(result$beta_est))) +}) + +test_that("sdpr opt_llk = 2 with realistic genotype data", { + set.seed(2024) + n <- 500 + p <- 20 + X <- matrix(rbinom(n * p, 2, 0.3), nrow = n) + beta_true <- rep(0, p) + beta_true[c(3, 15)] <- c(0.4, 0.2) + y <- X %*% beta_true + rnorm(n) + bhat <- as.vector(cor(y, X)) + R <- cor(X) + # Mixed arrays with varying sample sizes + arr <- rep(c(1, 2), length.out = p) + per_n <- rep(c(400, 500), length.out = p) + result <- sdpr(bhat = bhat, LD = list(blk1 = R), n = n, + per_variant_sample_size = per_n, array = arr, opt_llk = 2, + iter = 100, burn = 30, thin = 5, verbose = FALSE, seed = 42L) expect_equal(length(result$beta_est), p) expect_true(all(is.finite(result$beta_est))) }) diff --git a/tests/testthat/test_sumstats_qc.R b/tests/testthat/test_sumstats_qc.R index 20a7f3be..2159404e 100644 --- a/tests/testthat/test_sumstats_qc.R +++ b/tests/testthat/test_sumstats_qc.R @@ -390,3 +390,33 @@ test_that("summary_stats_qc returns LD_mat matching filtered sumstats dimensions expect_equal(nrow(result$LD_mat), nrow(result$sumstats)) expect_equal(ncol(result$LD_mat), nrow(result$sumstats)) }) + +# =========================================================================== +# ld_mismatch_qc +# =========================================================================== + +test_that("ld_mismatch_qc with dentist method returns data frame with outlier column", { + set.seed(42) + p <- 20 + R <- diag(p) + z <- rnorm(p) + result <- ld_mismatch_qc(z, R = R, nSample = 1000, method = "dentist") + expect_true(is.data.frame(result) || is.list(result)) + expect_true("outlier" %in% names(result)) +}) + +test_that("ld_mismatch_qc with slalom method returns data frame with outlier column", { + set.seed(42) + p <- 20 + R <- diag(p) + z <- rnorm(p) + result <- ld_mismatch_qc(z, R = R, method = "slalom") + expect_true(is.data.frame(result) || is.list(result)) + expect_true("outlier" %in% names(result)) +}) + +test_that("ld_mismatch_qc method argument is validated", { + z <- rnorm(5) + R <- diag(5) + expect_error(ld_mismatch_qc(z, R = R, method = "invalid")) +}) diff --git a/vignettes/dentist.Rmd b/vignettes/dentist.Rmd index 0474579d..23922c9c 100644 --- a/vignettes/dentist.Rmd +++ b/vignettes/dentist.Rmd @@ -19,8 +19,8 @@ DENTIST (Detecting Errors iN analyses of summary staTISTics) performs quality co The `pecotmr` package provides a pure R implementation of DENTIST that can be used in two ways: -1. **File-based input** via `dentist_from_files()` — takes PLINK binary files and a summary statistics file, matching the interface of the original DENTIST C++ binary. -2. **Direct input** via `dentist_single_window()` — takes z-scores and an LD matrix (or genotype matrix) directly, suitable for integration into analysis pipelines. +1. **Windowed QC** via `dentist()` — takes summary statistics and an LD matrix, runs windowed imputation with configurable window sizes. +2. **Single-window QC** via `dentist_single_window()` — takes z-scores and an LD matrix (or genotype matrix) directly, suitable for integration into analysis pipelines. We also demonstrate how the alternative **SLALOM + RAISS** pipeline achieves comparable QC and imputation on the same data, giving users two complementary approaches for summary statistics quality control. @@ -54,9 +54,7 @@ cat(sprintf("Full dataset: %d variants\n", nrow(sumstat))) head(sumstat, 3) ``` -## Case 1: File-Based DENTIST with Windowed QC - -The `dentist_from_files()` function mirrors the interface of the DENTIST C++ binary. It takes a summary statistics file path and a PLINK bfile prefix, then internally handles allele alignment, LD computation, and windowed imputation. +## Case 1: Windowed DENTIST QC DENTIST supports two windowing strategies: @@ -65,56 +63,107 @@ DENTIST supports two windowing strategies: Here we demonstrate count mode on the full dataset, which avoids the "<2000 variants" warning that distance mode produces on regions with sparse variant density. -### Run with `min_dim = 2000` +First, we load the genotype data and align alleles between the summary statistics and the reference panel. + +```{r prepare-data} +# Load reference panel variant info +bim <- as.data.frame(vroom(paste0(bfile, ".bim"), + col_names = c("chrom", "variant_id", "gd", "pos", "A1", "A2"), + show_col_types = FALSE)) + +# Load genotype matrix +X <- load_genotype_region(bfile) +nSample <- nrow(X) +cat(sprintf("Reference panel: %d samples, %d variants\n", nSample, ncol(X))) + +# Allele QC: align summary stats to reference panel +target_df <- data.frame( + chrom = as.integer(bim$chrom[match(sumstat$SNP, bim$variant_id)]), + pos = as.integer(bim$pos[match(sumstat$SNP, bim$variant_id)]), + A1 = sumstat$A1, + A2 = sumstat$A2, + z = sumstat$z, + SNP = sumstat$SNP, + stringsAsFactors = FALSE +) +target_df <- target_df[!is.na(target_df$chrom), ] -```{r write-full-sumstat} -# Write the full summary statistics to a temp file for dentist_from_files -full_sumstat <- tempfile(fileext = ".txt") -readr::write_tsv(sumstat, full_sumstat) +ref_df <- data.frame( + chrom = as.integer(bim$chrom[match(target_df$SNP, bim$variant_id)]), + pos = as.integer(bim$pos[match(target_df$SNP, bim$variant_id)]), + A1 = bim$A1[match(target_df$SNP, bim$variant_id)], + A2 = bim$A2[match(target_df$SNP, bim$variant_id)], + stringsAsFactors = FALSE +) + +qc_result <- allele_qc( + target_data = target_df, ref_variants = ref_df, + col_to_flip = "z", match_min_prop = 0, + remove_dups = TRUE, remove_strand_ambiguous = TRUE +) +aligned <- qc_result$target_data_qced +aligned <- aligned[order(aligned$pos), ] +rownames(aligned) <- NULL +cat(sprintf("After allele QC: %d variants\n", nrow(aligned))) + +# Subset genotypes to aligned variants and compute LD +X_sub <- X[, aligned$SNP[aligned$SNP %in% colnames(X)], drop = FALSE] +col_vars <- apply(X_sub, 2, function(x) length(unique(x[!is.na(x)])) > 1) +X_sub <- X_sub[, col_vars, drop = FALSE] +aligned <- aligned[aligned$SNP %in% colnames(X_sub), ] + +LD_mat <- compute_LD(X_sub, method = "sample") +cat(sprintf("LD matrix: %d x %d\n", nrow(LD_mat), ncol(LD_mat))) + +# Prepare input for dentist() +dentist_input <- data.frame( + SNP = aligned$SNP, pos = aligned$pos, z = aligned$z, + stringsAsFactors = FALSE +) ``` +### Run with `min_dim = 2000` + ```{r run-dentist-2000} # Run DENTIST with count-based windowing: 2000 variants per window -result_2000 <- dentist_from_files( - gwas_summary = full_sumstat, - bfile = bfile, - nSample = NULL, +result_2000 <- dentist( + sum_stat = dentist_input, + R = LD_mat, + nSample = nSample, window_mode = "count", min_dim = 2000, nIter = 8, pValueThreshold = 5.0369e-08, propSVD = 0.4, - duprThreshold = 0.99, - verbose = FALSE + duprThreshold = 0.99 ) cat(sprintf("min_dim = 2000: %d variants, %d outliers (%.1f%%)\n", - nrow(result_2000$result), - sum(result_2000$result$outlier), - 100 * mean(result_2000$result$outlier))) + nrow(result_2000), + sum(result_2000$outlier), + 100 * mean(result_2000$outlier))) ``` ### Run with `min_dim = 6000` ```{r run-dentist-6000} # Run DENTIST with larger windows: 6000 variants per window -result_6000 <- dentist_from_files( - gwas_summary = full_sumstat, - bfile = bfile, - nSample = NULL, +result_6000 <- dentist( + sum_stat = dentist_input, + R = LD_mat, + nSample = nSample, window_mode = "count", min_dim = 6000, nIter = 8, pValueThreshold = 5.0369e-08, propSVD = 0.4, - duprThreshold = 0.99, - verbose = FALSE + duprThreshold = 0.99 ) cat(sprintf("min_dim = 6000: %d variants, %d outliers (%.1f%%)\n", - nrow(result_6000$result), - sum(result_6000$result$outlier), - 100 * mean(result_6000$result$outlier))) + nrow(result_6000), + sum(result_6000$outlier), + 100 * mean(result_6000$outlier))) ``` ### Sensitivity to Window Size @@ -122,9 +171,9 @@ cat(sprintf("min_dim = 6000: %d variants, %d outliers (%.1f%%)\n", ```{r compare-window-sizes} cat("=== DENTIST Outlier Detection by Window Size ===\n\n") cat(sprintf(" min_dim = 2000: %d outliers (%.1f%%)\n", - sum(result_2000$result$outlier), 100 * mean(result_2000$result$outlier))) + sum(result_2000$outlier), 100 * mean(result_2000$outlier))) cat(sprintf(" min_dim = 6000: %d outliers (%.1f%%)\n", - sum(result_6000$result$outlier), 100 * mean(result_6000$result$outlier))) + sum(result_6000$outlier), 100 * mean(result_6000$outlier))) cat(" min_dim = 20000: 97 outliers (0.6%) [not run here; ~4 min compute time]\n") ``` @@ -132,12 +181,6 @@ The number of detected outliers varies substantially with window size: smaller w This sensitivity to window size means that DENTIST outlier counts should be interpreted with caution. The windowed approach inherently couples QC results to the choice of window parameters, which is a known limitation. For a more robust QC approach, consider using **SLALOM** (demonstrated below), which does not depend on window size parameters. -The `dentist_from_files()` return value is a list with three components: - -- `result` — data frame with outlier detection results (one row per input variant) -- `sum_stat` — aligned summary statistics after allele QC -- `LD_mat` — the LD correlation matrix used for imputation - ## Case 2: Single-Window DENTIST vs SLALOM + RAISS For pipeline integration, `dentist_single_window()` accepts z-scores and an LD matrix directly. We compare this with the SLALOM + RAISS pipeline, which uses a different approach to achieve the same goal: identifying problematic variants and imputing corrected z-scores. @@ -146,76 +189,45 @@ For pipeline integration, `dentist_single_window()` accepts z-scores and an LD m **SLALOM** computes the DENTIST-S statistic relative to the lead variant to flag outliers among variants in high LD. **RAISS** then imputes z-scores for a given set of variants using LD-based regression from the remaining variants. -### Prepare aligned data +### Prepare subset data We subset to a smaller region for the single-window comparison, since `dentist_single_window()` operates on a single LD matrix in memory. ```{r subset-data} -# Load reference panel variant info -bim <- as.data.frame(vroom(paste0(bfile, ".bim"), col_names = c("chrom", "variant_id", "gd", "pos", "A1", "A2"), - show_col_types = FALSE)) - # Select ~500 variants from a contiguous region for fast execution region_start <- bim$pos[1] region_end <- region_start + 2e6 region_snps <- bim$variant_id[bim$pos >= region_start & bim$pos <= region_end] +common_ids <- intersect(region_snps, aligned$SNP) cat(sprintf("Selected %d variants in %.1f Mb region (pos %s - %s)\n", - length(region_snps), + length(common_ids), (region_end - region_start) / 1e6, format(region_start, big.mark = ","), format(region_end, big.mark = ","))) ``` -```{r file-based-subset} -# Subset summary statistics to the selected region and run DENTIST -sumstat_sub <- sumstat[sumstat$SNP %in% region_snps, ] -sub_sumstat <- tempfile(fileext = ".txt") -readr::write_tsv(sumstat_sub, sub_sumstat) -cat(sprintf("Subset: %d variants in summary stats\n", nrow(sumstat_sub))) - -# Run DENTIST on the subset (count mode, small windows for toy data) -result_files <- dentist_from_files( - gwas_summary = sub_sumstat, - bfile = bfile, - nSample = NULL, - window_mode = "count", - min_dim = 500, - nIter = 8, - pValueThreshold = 5.0369e-08, - propSVD = 0.4, - duprThreshold = 0.99, - verbose = FALSE -) -``` - ```{r prepare-aligned} -# Use the aligned data from dentist_from_files for consistency -# This ensures allele QC has already been applied -aligned_sumstat <- result_files$sum_stat -LD_mat <- result_files$LD_mat -n_samples <- nrow(vroom(paste0(bfile, ".fam"), col_names = FALSE, show_col_types = FALSE)) +# Subset to the region and sort by position +aligned_sub <- aligned[aligned$SNP %in% common_ids, ] +aligned_sub <- aligned_sub[order(aligned_sub$pos), ] -z_scores <- aligned_sumstat$z -n_snps <- length(z_scores) -cat(sprintf("Aligned data: %d variants, %d reference samples\n", n_snps, n_samples)) +# Subset LD matrix to the region +LD_sub <- LD_mat[aligned_sub$SNP, aligned_sub$SNP, drop = FALSE] # Build a position-sorted variant info data frame for RAISS -bim_full <- as.data.frame(vroom(paste0(bfile, ".bim"), col_names = c("chrom", "variant_id", "gd", "pos", "A1", "A2"), - show_col_types = FALSE)) -bim_aligned <- bim_full[match(aligned_sumstat$SNP, bim_full$variant_id), ] - -# Sort everything by position (required by RAISS) -pos_order <- order(bim_aligned$pos) +bim_aligned <- bim[match(aligned_sub$SNP, bim$variant_id), ] ref_panel_df <- data.frame( - chrom = bim_aligned$chrom[pos_order], - pos = bim_aligned$pos[pos_order], - variant_id = bim_aligned$variant_id[pos_order], - A1 = bim_aligned$A1[pos_order], - A2 = bim_aligned$A2[pos_order], + chrom = bim_aligned$chrom, + pos = bim_aligned$pos, + variant_id = bim_aligned$variant_id, + A1 = bim_aligned$A1, + A2 = bim_aligned$A2, stringsAsFactors = FALSE ) -z_sorted <- z_scores[pos_order] -LD_sorted <- LD_mat[pos_order, pos_order] +z_sorted <- aligned_sub$z +LD_sorted <- LD_sub +n_snps <- length(z_sorted) +cat(sprintf("Aligned data: %d variants, %d reference samples\n", n_snps, nSample)) ``` ### Run DENTIST (single-window) @@ -224,7 +236,7 @@ LD_sorted <- LD_mat[pos_order, pos_order] dentist_sw <- dentist_single_window( zScore = z_sorted, R = LD_sorted, - nSample = n_samples, + nSample = nSample, pValueThreshold = 5.0369e-08, propSVD = 0.4, nIter = 8, @@ -426,7 +438,7 @@ if (sum(dentist_outlier_flags) > 1) { The `pecotmr` package provides two complementary approaches for GWAS summary statistics QC: -**DENTIST-based** (`dentist_from_files` / `dentist_single_window`): +**DENTIST-based** (`dentist` / `dentist_single_window`): - Iterative SVD-truncated imputation across the full LD structure - Flags outliers via chi-squared test on `(z_obs - z_imp)^2 / (1 - r^2)`