StatFunGen · danielnachun · Apr 14, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 14, 2026
diff --git a/.github/recipe/recipe.yaml b/.github/recipe/recipe.yaml
@@ -23,6 +23,7 @@ requirements:
     - ${{ stdlib('c') }}
     - ${{ compiler('cxx') }}
   host:
+    - bioconductor-biostrings
     - bioconductor-iranges
     - bioconductor-qvalue
     - bioconductor-s4vectors
@@ -69,6 +70,7 @@ requirements:
     - r-vroom
     - r-xicor
   run:
+    - bioconductor-biostrings
     - bioconductor-iranges
     - bioconductor-qvalue
     - bioconductor-s4vectors

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -36,7 +36,7 @@ jobs:
       - name: Create TOML from recipe
         run: |
           .github/workflows/create_toml_from_yaml.sh ${GITHUB_WORKSPACE}
-          mkdir /tmp/pixi
+          mkdir -p /tmp/pixi
           mv ${GITHUB_WORKSPACE}/pixi.toml /tmp/pixi
 
       - name: Setup pixi
@@ -45,7 +45,9 @@ jobs:
           manifest-path: /tmp/pixi/pixi.toml
 
       - name: Run unit tests
-        run: pixi run --environment ${{ matrix.environment }} --manifest-path /tmp/pixi/pixi.toml devtools_test
+        run: |
+          pixi config set --manifest-path /tmp/pixi/pixi.toml --local run-post-link-scripts insecure
+          pixi run --environment ${{ matrix.environment }} --manifest-path /tmp/pixi/pixi.toml devtools_test
 
       - name: Check unit test code coverage
         if: matrix.platform == 'linux-64' && matrix.environment == 'r44'

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,6 +14,7 @@ Authors@R: c(person("Gao Wang",role = c("cre","aut"),
                   person("StatFunGen Lab", role = "ctb"))
 License: MIT + file LICENSE
 Imports:
+    Biostrings,
     IRanges,
     Rcpp,
     S4Vectors,

diff --git a/NAMESPACE b/NAMESPACE
@@ -23,9 +23,7 @@ export(coloc_wrapper)
 export(colocboost_analysis_pipeline)
 export(compute_LD)
 export(compute_qtl_enrichment)
-export(ctwas_bimfile_loader)
 export(dentist)
-export(dentist_from_files)
 export(dentist_single_window)
 export(dpr_weights)
 export(enet_weights)
@@ -39,7 +37,6 @@ export(find_data)
 export(find_duplicate_variants)
 export(fsusie_get_cs)
 export(fsusie_wrapper)
-export(get_ctwas_meta_data)
 export(get_filter_lbf_index)
 export(get_nested_element)
 export(get_ref_variant_info)
@@ -54,6 +51,7 @@ export(lassosum_rss)
 export(lassosum_rss_weights)
 export(lbf_to_alpha)
 export(ld_loader)
+export(ld_mismatch_qc)
 export(load_LD_matrix)
 export(load_genotype_region)
 export(load_multicontext_sumstats)
@@ -89,7 +87,6 @@ export(normalize_variant_id)
 export(otters_association)
 export(otters_weights)
 export(parse_cs_corr)
-export(parse_dentist_output)
 export(parse_region)
 export(parse_variant_id)
 export(perform_qr_analysis)
@@ -99,7 +96,6 @@ export(qr_screen)
 export(quantile_twas_weight_pipeline)
 export(raiss)
 export(read_afreq)
-export(read_dentist_sumstat)
 export(region_to_df)
 export(rss_analysis_pipeline)
 export(rss_basic_qc)

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -1,24 +1,6 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-#' Compute LD matrix using GCTA-style formula matching the DENTIST binary.
-#'
-#' This function computes pairwise Pearson correlations from a genotype matrix
-#' using the exact same formula and floating-point operation order as the
-#' original DENTIST C++ binary (calcLDFromBfile_gcta in bfileOperations.cpp).
-#' Key differences from R's crossprod-based approach:
-#'   1. Accumulates integer-valued sums sequentially (exact intermediate values)
-#'   2. Handles per-pair missing data with GCTA correction formula
-#'   3. Divides by nKeptSample (trimmed to multiple of 4), not nrow(X)
-#'
-#' @param X Numeric genotype matrix (samples x SNPs), values in {0, 1, 2, NA}.
-#'          Rows should already be trimmed to a multiple of 4.
-#' @param ncpus Number of CPU threads for parallel computation.
-#' @return Symmetric LD correlation matrix with 1.0 on diagonal.
-compute_LD_gcta_cpp <- function(X, ncpus = 1L) {
-    .Call('_pecotmr_compute_LD_gcta_cpp', PACKAGE = 'pecotmr', X, ncpus)
-}
-
 dentist_iterative_impute <- function(LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose = FALSE) {
     .Call('_pecotmr_dentist_iterative_impute', PACKAGE = 'pecotmr', LD_mat, nSample, zScore, pValueThreshold, propSVD, gcControl, nIter, gPvalueThreshold, ncpus, correct_chen_et_al_bug, verbose)
 }
@@ -35,7 +17,7 @@ qtl_enrichment_rcpp <- function(r_gwas_pip, r_qtl_susie_fit, pi_gwas = 0, pi_qtl
     .Call('_pecotmr_qtl_enrichment_rcpp', PACKAGE = 'pecotmr', r_gwas_pip, r_qtl_susie_fit, pi_gwas, pi_qtl, ImpN, shrinkage_lambda, num_threads)
 }
 
-sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE) {
-    .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose)
+sdpr_rcpp <- function(bhat, LD, n, per_variant_sample_size = NULL, array = NULL, a = 0.1, c = 1.0, M = 1000L, a0k = 0.5, b0k = 0.5, iter = 1000L, burn = 200L, thin = 5L, n_threads = 1L, opt_llk = 1L, verbose = TRUE, seed = NULL) {
+    .Call('_pecotmr_sdpr_rcpp', PACKAGE = 'pecotmr', bhat, LD, n, per_variant_sample_size, array, a, c, M, a0k, b0k, iter, burn, thin, n_threads, opt_llk, verbose, seed)
 }
 
diff --git a/R/allele_qc.R b/R/allele_qc.R
@@ -27,27 +27,9 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
                      remove_indels = FALSE, remove_strand_ambiguous = TRUE,
                      flip_strand = FALSE, remove_unmatched = TRUE, ...) {
 	strand_flip <- function(ref) {
-	# Define a mapping for complementary bases
-	base_mapping <- c("A" = "T", "T" = "A", "G" = "C", "C" = "G")
-
-	# Function to complement a single base
-	complement_base <- function(base) {
-	  complement <- base_mapping[base]
-	  return(complement)
-	}
-
-	# Function to reverse and complement a DNA sequence
-	reverse_complement <- function(sequence) {
-	  reversed <- rev(strsplit(sequence, NULL)[[1]])
-	  complemented <- sapply(reversed, complement_base, USE.NAMES = FALSE)
-	  return(paste(complemented, collapse = ""))
+	  as.character(Biostrings::reverseComplement(Biostrings::DNAStringSet(ref)))
 	}
 
-	complemented_sequence <- sapply(ref, reverse_complement, USE.NAMES = FALSE)
-
-	return(complemented_sequence)
-  }
-
   # helper to sanitize column names to avoid NA/empty names that break dplyr verbs
   sanitize_names <- function(df) {
 	 nm <- colnames(df)
@@ -67,15 +49,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
 
   # check if the pattern is ATCG/DI
   check_ATCG <- function(vec) {
-	pattern <- "^[ATCGDI]+$"
-
-	# Function to check if a single element matches the pattern
-	check_element <- function(element) {
-	  grepl(pattern, element)
-	}
-
-	result <- sapply(vec, check_element, USE.NAMES = FALSE)
-	return(result)
+	grepl("^[ATCGDI]+$", vec)
   }
 
   # transform all inputs to dataframe
@@ -100,7 +74,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
   if (any(columns_to_remove %in% colnames(target_data))) {
 	 target_data <- select(target_data, -any_of(columns_to_remove))
   }
-  match_result <- merge(target_data, ref_variants, by = c("chrom", "pos"), all = FALSE, suffixes = c(".target", ".ref")) %>% as.data.frame()
+  match_result <- inner_join(target_data, ref_variants, by = c("chrom", "pos"), suffix = c(".target", ".ref")) %>% as.data.frame()
 
   # sanitize names after merge as well (merge can introduce empty names in edge cases)
   match_result <- sanitize_names(match_result)
@@ -134,7 +108,7 @@ allele_qc <- function(target_data, ref_variants, col_to_flip = NULL,
   }
   # if all strand flip is un-ambigous, then we know ambigous cases are indeed a strand flip
   # not a sign flip, then we infer there is no ambigous in the whole dataset, and keep those ambigous ones
-  if (nrow(match_result %>% filter(strand_flip == TRUE) %>% filter(strand_unambiguous == TRUE)) == 0) {
+  if (!any(match_result$strand_flip & match_result$strand_unambiguous)) {
 	match_result <- match_result %>% mutate(strand_unambiguous = TRUE)
   }
 

diff --git a/R/ctwas_wrapper.R b/R/ctwas_wrapper.R
@@ -1,35 +1,5 @@
-#' @importFrom vroom vroom
-#' @export
-ctwas_bimfile_loader <- function(bim_file_path) {
-  snp_info <- as.data.frame(vroom(bim_file_path, col_names = FALSE))
-  if (ncol(snp_info) == 9) {
-    colnames(snp_info) <- c("chrom", "id", "GD", "pos", "A1", "A2", "variance", "allele_freq", "n_nomiss")
-  } else {
-    colnames(snp_info) <- c("chrom", "id", "GD", "pos", "A1", "A2")
-  }
-  snp_info$id <- normalize_variant_id(snp_info$id)
-  return(snp_info)
-}
-
-#' Utility function to format meta data dataframe for cTWAS analyses
-#' @importFrom vroom vroom
-#' @export
-get_ctwas_meta_data <- function(ld_meta_data_file, subset_region_ids = NULL) {
-  LD_info <- as.data.frame(vroom(ld_meta_data_file))
-  colnames(LD_info)[1] <- "chrom"
-  LD_info$region_id <- paste(as.integer(strip_chr_prefix(LD_info$chrom)), LD_info$start, LD_info$end, sep = "_")
-  LD_info$LD_file <- paste0(dirname(ld_meta_data_file), "/", gsub(",.*$", "", LD_info$path))
-  LD_info$SNP_file <- paste0(LD_info$LD_file, ".bim")
-  LD_info <- LD_info[, c("region_id", "LD_file", "SNP_file")]
-  region_info <- LD_info[, "region_id", drop = FALSE]
-  region_info$chrom <- as.integer(gsub("\\_.*$", "", region_info$region_id))
-  region_info$start <- as.integer(gsub("\\_.*$", "", sub("^.*?\\_", "", region_info$region_id)))
-  region_info$stop <- as.integer(sub("^.*?\\_", "", sub("^.*?\\_", "", region_info$region_id)))
-  region_info$region_id <- paste0(region_info$chrom, "_", region_info$start, "_", region_info$stop)
-  region_info <- region_info[, c("chrom", "start", "stop", "region_id")]
-  if (!is.null(subset_region_ids)) region_info <- region_info[region_info$region_id %in% subset_region_ids, ]
-  return(list(LD_info = LD_info, region_info = region_info))
-}
+### File-I/O functions (ctwas_bimfile_loader, get_ctwas_meta_data) have been
+### removed. Use ld_loader() and read_bim() from the standard I/O path instead.
 
 #' Function to select variants for ctwas weights input
 #' @param region_data A list of list containing weights list and snp_info list data for multiple genes/events within a single LD block region.