From 59f7e050e404e60c5e723351df5d0f12c73e5c66 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 23 Apr 2026 15:55:47 -0400 Subject: [PATCH 1/5] Fix OTTERS lassosum selector parity --- DESCRIPTION | 1 + R/otters.R | 30 +- R/regularized_regression.R | 360 ++++++++++++++++-- inst/notes/otters_lassosum_selector_fix_pr.md | 120 ++++++ tests/testthat/test_otters.R | 50 ++- tests/testthat/test_rr_dispatch.R | 112 +++++- tests/testthat/test_rr_lassosum.R | 32 +- 7 files changed, 660 insertions(+), 45 deletions(-) create mode 100644 inst/notes/otters_lassosum_selector_fix_pr.md diff --git a/DESCRIPTION b/DESCRIPTION index f460e17c..5e858b8c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -51,6 +51,7 @@ Suggests: glmnet, harmonicmeanp, knitr, + lassosum, mashr, mr.mashr, mvsusieR, diff --git a/R/otters.R b/R/otters.R index 142cd469..8f41ba90 100644 --- a/R/otters.R +++ b/R/otters.R @@ -50,7 +50,7 @@ #' R <- diag(p) #' sumstats <- data.frame(z = z) #' weights <- otters_weights(sumstats, R, n, -#' methods = list(lassosum_rss = list()), +#' methods = list(lassosum_rss = list(selection = "min_fbeta")), #' p_thresholds = c(0.05)) #' #' @export @@ -87,10 +87,28 @@ otters_weights <- function(sumstats, LD, n, p <- nrow(sumstats) z <- sumstats$z + chrom_col <- intersect(c("chrom", "CHROM"), names(sumstats)) + pos_col <- intersect(c("pos", "POS"), names(sumstats)) + a1_col <- intersect(c("A1", "a1"), names(sumstats)) + a2_col <- intersect(c("A2", "a2"), names(sumstats)) + variant_info <- if (length(chrom_col) && + length(pos_col) && + length(a1_col) && + length(a2_col)) { + data.frame( + chrom = sumstats[[chrom_col[[1]]]], + pos = sumstats[[pos_col[[1]]]], + A1 = sumstats[[a1_col[[1]]]], + A2 = sumstats[[a2_col[[1]]]], + stringsAsFactors = FALSE + ) + } else { + NULL + } # Build stat object for _weights() convention b <- z / sqrt(n) - stat <- list(b = b, n = rep(n, p)) + stat <- list(b = b, cor = b, z = z, n = rep(n, p)) results <- list() @@ -114,7 +132,13 @@ otters_weights <- function(sumstats, LD, n, next } tryCatch({ - w <- do.call(fn_name, c(list(stat = stat, LD = LD), methods[[method_name]])) + method_args <- methods[[method_name]] + if (identical(method_name, "lassosum_rss") && + !is.null(variant_info) && + is.null(method_args$variant_info)) { + method_args$variant_info <- variant_info + } + w <- do.call(fn_name, c(list(stat = stat, LD = LD), method_args)) results[[method_name]] <- as.numeric(w) }, error = function(e) { warning(sprintf("Method '%s' failed: %s", method_name, e$message)) diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 2ff27b01..3e7b0b0a 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -865,58 +865,362 @@ lassosum_rss <- function(bhat, LD, n, result } +.lassosum_cor_from_stat <- function(stat, n, p) { + cor_input <- if (!is.null(stat$cor)) { + as.numeric(stat$cor) + } else if (!is.null(stat$z)) { + as.numeric(stat$z) / sqrt(n) + } else if (!is.null(stat$b)) { + as.numeric(stat$b) + } else { + stop("stat must contain one of 'cor', 'z', or 'b' for lassosum selection.") + } + if (length(cor_input) != p) { + stop("The length of lassosum input statistics (", length(cor_input), + ") must equal nrow(LD) (", p, ").") + } + cor_input +} + +.lassosum_clamp_cor <- function(cor_input) { + max_abs_cor <- max(abs(cor_input), na.rm = TRUE) + if (is.finite(max_abs_cor) && max_abs_cor >= 1) { + cor_input <- cor_input / (max_abs_cor / 0.9999) + } + cor_input +} + +.lassosum_prepare_variant_info <- function(variant_info, expected_n = NULL) { + if (is.null(variant_info)) { + return(NULL) + } + parsed <- parse_variant_id(variant_info) + info <- data.frame( + chrom = as.integer(parsed$chrom), + pos = as.integer(parsed$pos), + A1 = as.character(parsed$A1), + A2 = as.character(parsed$A2), + stringsAsFactors = FALSE + ) + if (!is.null(expected_n) && nrow(info) != expected_n) { + stop("variant_info has ", nrow(info), " rows but expected ", expected_n, ".") + } + info +} + +.lassosum_filter_source_info <- function(info, region = NULL) { + if (is.null(region)) { + return(info) + } + parsed <- parse_region(region) + keep <- strip_chr_prefix(as.character(info$chrom)) == parsed$chrom & + as.integer(info$pos) >= parsed$start & + as.integer(info$pos) <= parsed$end + info[keep, , drop = FALSE] +} + +.lassosum_read_source_info <- function(genotype_source, genotype_type, region = NULL) { + if (genotype_type == "plink1") { + bim <- read_bim(paste0(genotype_source, ".bed")) + info <- data.frame( + id = as.character(bim$id), + chrom = as.character(bim$chrom), + pos = as.integer(bim$pos), + A1 = as.character(bim$a1), + A2 = as.character(bim$a0), + stringsAsFactors = FALSE + ) + } else if (genotype_type == "plink2") { + paths <- resolve_plink2_paths(genotype_source) + pvar <- read_pvar_text(paths$pvar) + info <- data.frame( + id = as.character(pvar$id), + chrom = as.character(pvar$chrom), + pos = as.integer(pvar$pos), + A1 = as.character(pvar$A1), + A2 = as.character(pvar$A2), + stringsAsFactors = FALSE + ) + } else { + stop("Unsupported genotype_type for lassosum source info: ", genotype_type) + } + .lassosum_filter_source_info(info, region = region) +} + +.lassosum_match_variants <- function(variant_info, source_info) { + target <- .lassosum_prepare_variant_info(variant_info) + source <- source_info + if (is.null(target) || !nrow(target) || !nrow(source)) { + return(data.frame()) + } + + target$old_idx <- seq_len(nrow(target)) + source$source_idx <- seq_len(nrow(source)) + source$chrom <- as.integer(strip_chr_prefix(as.character(source$chrom))) + source$pos <- as.integer(source$pos) + source$source_A1 <- as.character(source$A1) + source$source_A2 <- as.character(source$A2) + + same_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A1", "source_A2"), drop = FALSE] + names(same_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2") + same <- merge(target, same_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE) + same$orientation <- 1L + + flip_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A2", "source_A1"), drop = FALSE] + names(flip_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2") + flip <- merge(target, flip_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE) + flip$orientation <- -1L + + matched <- rbind(same, flip) + if (!nrow(matched)) { + return(data.frame()) + } + + matched$orientation_rank <- ifelse(matched$orientation == 1L, 0L, 1L) + matched <- matched[order(matched$old_idx, matched$orientation_rank, matched$source_idx), , drop = FALSE] + matched <- matched[!duplicated(matched$old_idx), , drop = FALSE] + matched <- matched[order(matched$source_idx, matched$orientation_rank, matched$old_idx), , drop = FALSE] + matched <- matched[!duplicated(matched$source_idx), , drop = FALSE] + matched <- matched[order(matched$source_idx), , drop = FALSE] + matched[, c("old_idx", "source_idx", "source_id", "orientation"), drop = FALSE] +} + +.lassosum_first_max <- function(x) { + which(x == max(x, na.rm = TRUE))[1] +} + +.lassosum_select_min_fbeta <- function(candidate_beta, candidate_meta) { + idx <- which.min(candidate_meta$fbeta) + list( + beta = candidate_beta[, idx], + index = idx, + mode = "min_fbeta" + ) +} + +.lassosum_score_candidates_with_bfile <- function(genotype_source, region, variant_info, + candidate_beta, cor_input) { + if (!requireNamespace("lassosum", quietly = TRUE)) { + stop("The 'lassosum' package is required for PLINK1 pseudovalidation.") + } + source_info <- .lassosum_read_source_info(genotype_source, "plink1", region = region) + matched <- .lassosum_match_variants(variant_info, source_info) + if (!nrow(matched)) { + stop("No overlapping variants between variant_info and PLINK1 genotype source.") + } + + beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*") + cor_oriented <- cor_input[matched$old_idx] * matched$orientation + extract_ids <- matched$source_id + sd_vec <- lassosum:::sd.bfile(bfile = genotype_source, extract = extract_ids, trace = 0) + scores <- lassosum:::pseudovalidation( + bfile = genotype_source, + beta = beta_oriented, + cor = cor_oriented, + extract = extract_ids, + sd = sd_vec + ) + scores[is.na(scores)] <- -Inf + idx <- .lassosum_first_max(scores) + list( + beta = candidate_beta[, idx], + index = idx, + mode = "plink1_pseudovalidation" + ) +} + +.lassosum_score_candidates_with_sketch <- function(genotype_source, region, variant_info, + candidate_beta, cor_input) { + sketch <- load_plink2_data(genotype_source, region = region) + source_info <- data.frame( + id = as.character(sketch$variant_info$id), + chrom = as.character(sketch$variant_info$chrom), + pos = as.integer(sketch$variant_info$pos), + A1 = as.character(sketch$variant_info$A1), + A2 = as.character(sketch$variant_info$A2), + stringsAsFactors = FALSE + ) + matched <- .lassosum_match_variants(variant_info, source_info) + if (!nrow(matched)) { + stop("No overlapping variants between variant_info and PLINK2 genotype source.") + } + + X <- sketch$X[, matched$source_idx, drop = FALSE] + X <- mean_impute(X) + beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*") + cor_oriented <- cor_input[matched$old_idx] * matched$orientation + sd_vec <- apply(X, 2, stats::sd) + sd_vec[!is.finite(sd_vec) | sd_vec <= 0] <- Inf + scaled_beta <- sweep(beta_oriented, 1, sd_vec, "/") + scaled_beta[!is.finite(scaled_beta)] <- 0 + pred <- X %*% scaled_beta + pred_centered <- scale(pred, scale = FALSE) + bxxb <- colSums(pred_centered^2) / nrow(pred_centered) + bxy <- as.numeric(crossprod(cor_oriented, beta_oriented)) + scores <- as.numeric(bxy / sqrt(bxxb)) + scores[!is.finite(scores)] <- -Inf + idx <- .lassosum_first_max(scores) + list( + beta = candidate_beta[, idx], + index = idx, + mode = "plink2_sketch_pseudovalidation" + ) +} + +.lassosum_resolve_selection_mode <- function(selection, genotype_source, genotype_type) { + if (selection == "min_fbeta") { + return("min_fbeta") + } + + if (genotype_type == "auto") { + if (is.null(genotype_source) || identical(genotype_source, "")) { + return("min_fbeta") + } + if (has_plink1_files(genotype_source)) { + return("plink1") + } + if (has_plink2_files(genotype_source)) { + return("plink2") + } + return("min_fbeta") + } + + if (genotype_type %in% c("plink1", "plink2")) { + return(genotype_type) + } + + "min_fbeta" +} + #' Extract weights from lassosum_rss with shrinkage grid search #' #' Searches over a grid of shrinkage parameters \code{s} (default: #' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS). #' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then -#' \code{lassosum_rss()} is called across the lambda path. The best -#' \code{(s, lambda)} combination is selected by the lowest objective value. +#' \code{lassosum_rss()} is called across the lambda path. Candidate selection +#' depends on the available source: +#' \itemize{ +#' \item PLINK1 \code{.bed/.bim/.fam}: published \code{lassosum} pseudovalidation +#' \item PLINK2 sketch \code{.pgen/.pvar/.psam}: sketch-genotype pseudovalidation +#' \item no genotype-backed source: fallback to \code{min(fbeta)} with warning +#' } #' #' @details -#' Model selection uses \code{min(fbeta)} (penalized objective) rather than -#' the pseudovalidation approach from the original lassosum R package. Empirical -#' comparison over 20 random trials (n=300, p=50, 3 causal) shows no systematic -#' advantage for either method: pseudovalidation won 4/20, min(fbeta) won 6/20, -#' tied 10/20. The shrinkage grid over \code{s} provides the primary regularization; -#' lambda selection within each \code{s} has minimal impact. +#' OTTERS compatibility requires source-aware model selection. When a PLINK1 +#' genotype source is available, this wrapper reproduces the old +#' genotype-backed pseudovalidation selector. For PLINK2 sketch inputs, it +#' performs pseudovalidation on the restored \code{U} matrix using +#' \code{U_MIN/U_MAX}. If no genotype-backed source is provided, the function +#' warns and falls back to \code{min(fbeta)}. #' #' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes). #' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk). #' @param s Numeric vector of shrinkage parameters to search over. Default: #' \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS. +#' @param selection Selection strategy. Default \code{"auto"} uses +#' source-aware pseudovalidation when possible and falls back to +#' \code{"min_fbeta"} otherwise. +#' @param genotype_source Optional PLINK prefix for genotype-backed or +#' sketch-genotype selection. +#' @param genotype_type Source type for \code{genotype_source}: \code{"auto"}, +#' \code{"plink1"}, \code{"plink2"}, or \code{"none"}. +#' @param region Optional region string \code{"chr:start-end"} used to filter +#' genotype source variants. +#' @param variant_info Optional variant annotation aligned to \code{stat} and +#' \code{LD}; must identify \code{chrom}, \code{pos}, \code{A1}, \code{A2}. #' @param ... Additional arguments passed to \code{lassosum_rss()}. #' #' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda). #' @export -lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), ...) { +lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), + selection = c("auto", "min_fbeta"), + genotype_source = NULL, + genotype_type = c("auto", "plink1", "plink2", "none"), + region = NULL, + variant_info = NULL, + ...) { + selection <- match.arg(selection) + genotype_type <- match.arg(genotype_type) n <- median(stat$n) p <- nrow(LD) - best_fbeta <- Inf - best_beta <- rep(0, p) - - # Clamp marginal correlations to (-1, 1) as required by lassosum. - # This is lassosum-specific — other methods (PRS-CS, SDPR) handle - # their own regularization and should not be globally rescaled. - # Matches OTTERS shrink_factor logic (PRSmodels/lassosum.R lines 71-77). - bhat <- stat$b - max_abs_b <- max(abs(bhat)) - if (max_abs_b >= 1) { - bhat <- bhat / (max_abs_b / 0.9999) - } + cor_input <- .lassosum_clamp_cor(.lassosum_cor_from_stat(stat, n = n, p = p)) + solver_input <- cor_input * sqrt(n) + candidate_beta <- NULL + candidate_meta <- list() for (s_val in s) { - # Shrink LD: R_s = (1 - s) * R + s * I LD_s <- (1 - s_val) * LD + s_val * diag(p) - model <- lassosum_rss(bhat = bhat, LD = list(blk1 = LD_s), n = n, ...) - min_fbeta <- min(model$fbeta) - if (min_fbeta < best_fbeta) { - best_fbeta <- min_fbeta - best_beta <- model$beta_est + model <- lassosum_rss(bhat = solver_input, LD = list(blk1 = LD_s), n = n, ...) + candidate_beta <- cbind(candidate_beta, model$beta) + candidate_meta[[length(candidate_meta) + 1L]] <- data.frame( + s = rep(s_val, length(model$lambda)), + lambda = model$lambda, + fbeta = model$fbeta, + stringsAsFactors = FALSE + ) + } + candidate_meta <- do.call(rbind, candidate_meta) + + mode <- .lassosum_resolve_selection_mode(selection, genotype_source, genotype_type) + variant_info_prepped <- .lassosum_prepare_variant_info(variant_info, expected_n = p) + selector_result <- NULL + + if (mode == "plink1" || mode == "plink2") { + selector_result <- tryCatch( + { + if (is.null(variant_info_prepped)) { + stop("variant_info is required for genotype-backed lassosum selection.") + } + if (is.null(genotype_source) || identical(genotype_source, "")) { + stop("genotype_source is required for genotype-backed lassosum selection.") + } + if (mode == "plink1") { + .lassosum_score_candidates_with_bfile( + genotype_source = genotype_source, + region = region, + variant_info = variant_info_prepped, + candidate_beta = candidate_beta, + cor_input = cor_input + ) + } else { + .lassosum_score_candidates_with_sketch( + genotype_source = genotype_source, + region = region, + variant_info = variant_info_prepped, + candidate_beta = candidate_beta, + cor_input = cor_input + ) + } + }, + error = function(e) { + warning( + "lassosum_rss_weights could not use source-aware pseudovalidation (", + conditionMessage(e), + "); falling back to min(fbeta), which is not old-OTTERS-compatible." + ) + NULL + } + ) + } + + if (is.null(selector_result)) { + if (mode == "min_fbeta" && selection == "auto") { + warning( + "lassosum_rss_weights has no genotype-backed source for pseudovalidation; ", + "falling back to min(fbeta), which is not old-OTTERS-compatible." + ) } + selector_result <- .lassosum_select_min_fbeta(candidate_beta, candidate_meta) } - return(best_beta) + best_beta <- as.numeric(selector_result$beta) + attr(best_beta, "lassosum_selection") <- c( + mode = selector_result$mode, + index = selector_result$index, + s = candidate_meta$s[selector_result$index], + lambda = candidate_meta$lambda[selector_result$index] + ) + best_beta } #' Compute Weights Using ncvreg with SCAD or MCP Penalty diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md new file mode 100644 index 00000000..1e67ce49 --- /dev/null +++ b/inst/notes/otters_lassosum_selector_fix_pr.md @@ -0,0 +1,120 @@ +# OTTERS Lassosum Selector Fix + +## Summary + +This change makes OTTERS lassosum selection source-aware again. + +- PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation. +- PLINK2 sketch `pgen/pvar/psam` inputs use sketch-genotype pseudovalidation on restored `U`. +- Inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible. +- The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver. + +## Example 206 + +Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the regression. + +- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants +- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants + +Old published lassosum selected `s=0.2, lambda=1e-4`. Corrected-scaling plus `min(fbeta)` still selected `s=1, lambda=1e-4`. + +Implication: + +- this is not a grid-definition problem +- both candidates are already on the old grid +- the regression comes from changing the selection rule over the same grid + +For this fixture, old pseudovalidation is a degenerate tie case, so old OTTERS selects the first maximum. Replacing that with `min(fbeta)` moves the selected model to the opposite end of the same grid. + +## Example 161 + +Fixture `chr1_161023868_163023868__ENSG00000162745` is the informative selector case. + +On the common matched set of `4298` variants: + +- exact PLINK1 bfile pseudovalidation best candidate: `soft_lambda=0.041050213` +- sketch-genotype pseudovalidation best candidate: `soft_lambda=0.021788613` +- score agreement remains high: + - Pearson `0.9971` + - Spearman `0.9412` + - max absolute difference `0.0474` + +When the sketch path is forced to use `sd_bfile`, the winning candidate returns to `soft_lambda=0.041050213`. + +Implication: + +- this is not a variant-overlap problem +- this is a predictive normalization problem inside pseudovalidation +- restored sketch `U` recovers the score surface closely +- sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()` + +So PLINK1 is the exact old-parity path, while PLINK2 sketch is the right source-aware selector for sketch inputs but remains an approximation relative to old PLINK1 behavior. + +## What Is Fixed + +## `R/regularized_regression.R` + +- `lassosum_rss_weights()` no longer defaults every OTTERS run to `min(fbeta)`. +- It now selects by source: + - PLINK1: published `lassosum` pseudovalidation + - PLINK2 sketch: restored-`U` pseudovalidation + - no genotype-backed source: warning + `min(fbeta)` fallback +- It preserves old first-max tie behavior for pseudovalidation selectors. +- It passes lassosum input in correlation units only once before the low-level solver converts to `z / sqrt(n)`. + +What actually failed: + +- old OTTERS-compatible selection was replaced with `min(fbeta)` +- the OTTERS wrapper also double-scaled lassosum input + +Why it worked earlier: + +- old OTTERS used published `lassosum.pipeline(..., test.bfile = bfile)` followed by `pseudovalidate(out)` + +Why it failed here: + +- the refactor changed the selector and the scaling contract +- fixture `206` exposed the selector regression clearly because it is a tied pseudovalidation table where first-max and `min(fbeta)` pick different candidates + +Classification: + +- selector replacement: real behavioral regression +- double scaling: real implementation bug +- fixture `206` tie case: edge-case trigger that exposed the regression cleanly + +Compatibility: + +- exact old behavior is restored for PLINK1 genotype-backed OTTERS runs +- PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback +- pure precomputed-LD inputs remain on the fallback path and now say so explicitly + +## `R/otters.R` + +- `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available. +- This restores the variant-alignment information needed for source-aware selection. + +Classification: + +- wrapper or interface drift + +## Tests + +Focused validation for this change: + +- `tests/testthat/test_rr_lassosum.R` +- `tests/testthat/test_rr_dispatch.R` +- `tests/testthat/test_otters.R` + +Local evidence paths used during debugging: + +- `temp_reference/otters_regression/lassosum_oldR_direct_206/` +- `temp_reference/otters_regression/lassosum_forensics_206/` +- `temp_reference/otters_regression/pseudovalidation_sketch_vs_bfile/` + +## Narrow Compatibility Note + +This change only alters the OTTERS lassosum selection path. + +- It restores exact old behavior when a legacy PLINK1 genotype source is available. +- It makes PLINK2 sketch inputs use a sketch-genotype selector instead of the old generic fallback. +- It does not change PRS-CS or SDPR behavior. diff --git a/tests/testthat/test_otters.R b/tests/testthat/test_otters.R index fe3e4135..d31ce2fd 100644 --- a/tests/testthat/test_otters.R +++ b/tests/testthat/test_otters.R @@ -9,7 +9,7 @@ test_that("otters_weights returns named list of weight vectors", { R <- diag(p) sumstats <- data.frame(z = z) result <- otters_weights(sumstats, R, n, - methods = list(lassosum_rss = list()), + methods = list(lassosum_rss = list(selection = "min_fbeta")), p_thresholds = c(0.05) ) expect_type(result, "list") @@ -26,7 +26,7 @@ test_that("otters_weights computes z from beta/se if z missing", { sumstats <- data.frame(beta = rnorm(p, sd = 0.1), se = rep(0.05, p)) R <- diag(p) result <- otters_weights(sumstats, R, n, - methods = list(lassosum_rss = list()), + methods = list(lassosum_rss = list(selection = "min_fbeta")), p_thresholds = c(0.05) ) expect_true("lassosum_rss" %in% names(result)) @@ -75,7 +75,7 @@ test_that("otters_weights with multiple methods returns all", { sumstats <- data.frame(z = z) result <- otters_weights(sumstats, R, n, methods = list( - lassosum_rss = list(), + lassosum_rss = list(selection = "min_fbeta"), prs_cs = list(n_iter = 50, n_burnin = 10, thin = 2, seed = 42) ), p_thresholds = c(0.001, 0.05) @@ -87,6 +87,48 @@ test_that("otters_weights with multiple methods returns all", { } }) +test_that("otters_weights forwards variant_info only to lassosum", { + p <- 5 + n <- 100 + z <- rnorm(p) + R <- diag(p) + sumstats <- data.frame( + chrom = rep(1, p), + pos = seq_len(p), + A1 = c("A", "C", "G", "T", "A"), + A2 = c("G", "T", "A", "C", "G"), + z = z, + stringsAsFactors = FALSE + ) + captured <- new.env(parent = emptyenv()) + local_mocked_bindings( + lassosum_rss_weights = function(stat, LD, variant_info = NULL, ...) { + captured$lassosum_variant_info <- variant_info + captured$lassosum_dots <- list(...) + rep(0.1, nrow(LD)) + }, + prs_cs_weights = function(stat, LD, ...) { + captured$prs_cs_dots <- list(...) + rep(0.2, nrow(LD)) + } + ) + + result <- otters_weights( + sumstats, R, n, + methods = list( + lassosum_rss = list(selection = "min_fbeta"), + prs_cs = list(phi = 1e-4) + ), + p_thresholds = NULL, + check_ld_method = NULL + ) + + expect_equal(result$lassosum_rss, rep(0.1, p)) + expect_equal(result$prs_cs, rep(0.2, p)) + expect_equal(captured$lassosum_variant_info, sumstats[, c("chrom", "pos", "A1", "A2")]) + expect_null(captured$prs_cs_dots$variant_info) +}) + # ---- otters_association ---- test_that("otters_association returns correct structure", { set.seed(42) @@ -159,7 +201,7 @@ test_that("otters_weights + otters_association end-to-end on simulated data", { # Stage I: train weights sumstats <- data.frame(z = eqtl_z) weights <- otters_weights(sumstats, R, n_eqtl, - methods = list(lassosum_rss = list()), + methods = list(lassosum_rss = list(selection = "min_fbeta")), p_thresholds = c(0.05) ) expect_true(length(weights) >= 2) diff --git a/tests/testthat/test_rr_dispatch.R b/tests/testthat/test_rr_dispatch.R index 40e5f778..a17e44e9 100644 --- a/tests/testthat/test_rr_dispatch.R +++ b/tests/testthat/test_rr_dispatch.R @@ -75,10 +75,14 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", { lassosum_rss = function(bhat, LD, n, ...) { call_log$calls <- c(call_log$calls, list(list(bhat = bhat, LD = LD, n = n))) - list(beta_est = rep(0.05, length(bhat)), fbeta = c(1.0, 0.5)) + list( + beta = cbind(rep(0, length(bhat)), rep(0.05, length(bhat))), + lambda = c(0.05, 0.01), + fbeta = c(1.0, 0.5) + ) } ) - lassosum_rss_weights(stat = stat, LD = R, s = c(0.2, 0.9)) + lassosum_rss_weights(stat = stat, LD = R, s = c(0.2, 0.9), selection = "min_fbeta") expect_equal(length(call_log$calls), 2L) expect_equal(call_log$calls[[1]]$n, 100) expect_equal(length(call_log$calls[[1]]$bhat), p) @@ -86,6 +90,110 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", { expect_false(identical(call_log$calls[[1]]$LD, call_log$calls[[2]]$LD)) }) +test_that("lassosum_rss_weights auto mode dispatches to PLINK1 pseudovalidation", { + p <- 4 + stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) + R <- diag(p) + variant_info <- data.frame( + chrom = 1, + pos = 1:p, + A1 = c("A", "C", "G", "T"), + A2 = c("G", "T", "A", "C"), + stringsAsFactors = FALSE + ) + + local_mocked_bindings( + has_plink1_files = function(prefix) TRUE, + has_plink2_files = function(prefix) FALSE, + lassosum_rss = function(bhat, LD, n, ...) { + list( + beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)), + lambda = c(0.05, 0.01), + fbeta = c(2, 1) + ) + }, + .lassosum_score_candidates_with_bfile = function(...) { + list(beta = rep(0.7, p), index = 2L, mode = "plink1_pseudovalidation") + } + ) + + result <- lassosum_rss_weights( + stat = stat, + LD = R, + s = 0.5, + selection = "auto", + genotype_source = "/fake/plink1", + variant_info = variant_info + ) + + expect_equal(c(result), rep(0.7, p)) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink1_pseudovalidation") +}) + +test_that("lassosum_rss_weights auto mode dispatches to PLINK2 sketch pseudovalidation", { + p <- 4 + stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) + R <- diag(p) + variant_info <- data.frame( + chrom = 1, + pos = 1:p, + A1 = c("A", "C", "G", "T"), + A2 = c("G", "T", "A", "C"), + stringsAsFactors = FALSE + ) + + local_mocked_bindings( + has_plink1_files = function(prefix) FALSE, + has_plink2_files = function(prefix) TRUE, + lassosum_rss = function(bhat, LD, n, ...) { + list( + beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)), + lambda = c(0.05, 0.01), + fbeta = c(2, 1) + ) + }, + .lassosum_score_candidates_with_sketch = function(...) { + list(beta = rep(0.4, p), index = 1L, mode = "plink2_sketch_pseudovalidation") + } + ) + + result <- lassosum_rss_weights( + stat = stat, + LD = R, + s = 0.5, + selection = "auto", + genotype_source = "/fake/plink2", + variant_info = variant_info + ) + + expect_equal(c(result), rep(0.4, p)) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink2_sketch_pseudovalidation") +}) + +test_that("lassosum_rss_weights warns and falls back to min(fbeta) without genotype source", { + p <- 4 + stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) + R <- diag(p) + expected <- rep(0.2, p) + + local_mocked_bindings( + lassosum_rss = function(bhat, LD, n, ...) { + list( + beta = cbind(rep(0, length(bhat)), expected), + lambda = c(0.05, 0.01), + fbeta = c(5, 1) + ) + } + ) + + result <- expect_warning( + lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "auto"), + "not old-OTTERS-compatible" + ) + expect_equal(c(result), expected) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta") +}) + test_that("bayes_{n,l,a,c,r}_weights each dispatch to bayes_alphabet_weights with correct method", { set.seed(42) X <- matrix(rnorm(50), nrow = 10) diff --git a/tests/testthat/test_rr_lassosum.R b/tests/testthat/test_rr_lassosum.R index d620d0fc..fda14943 100644 --- a/tests/testthat/test_rr_lassosum.R +++ b/tests/testthat/test_rr_lassosum.R @@ -76,12 +76,24 @@ test_that("lassosum_rss_weights calls lassosum_rss and returns beta_est", { R[i + 1, i] <- 0.3 } stat <- list(b = bhat, n = rep(n, p)) - result <- lassosum_rss_weights(stat = stat, LD = R) + expected <- seq_len(p) * 0.02 + local_mocked_bindings( + lassosum_rss = function(bhat, LD, n, ...) { + list( + beta = cbind(rep(0, length(bhat)), expected), + lambda = c(0.05, 0.01), + fbeta = c(1, 0.1) + ) + } + ) + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta") expect_equal(length(result), p) expect_true(is.numeric(result)) + expect_equal(c(result), expected) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta") }) -test_that("lassosum_rss_weights rescales bhat when max(abs(bhat)) >= 1", { +test_that("lassosum_rss_weights clamps correlation input before scaling", { set.seed(42) p <- 5 n <- 100 @@ -93,13 +105,17 @@ test_that("lassosum_rss_weights rescales bhat when max(abs(bhat)) >= 1", { local_mocked_bindings( lassosum_rss = function(bhat, LD, n, ...) { captured <<- bhat - list(beta_est = rep(0, length(bhat)), fbeta = 1) + list( + beta = matrix(0, nrow = length(bhat), ncol = 2), + lambda = c(0.05, 0.01), + fbeta = c(1, 0.5) + ) } ) - result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5) - # Captured bhat should have been rescaled so max abs is just under 1 - expect_true(max(abs(captured)) < 1) - expect_equal(max(abs(captured)), 0.9999, tolerance = 1e-9) + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta") + scaled_cor <- captured / sqrt(n) + expect_true(max(abs(scaled_cor)) < 1) + expect_equal(max(abs(scaled_cor)), 0.9999, tolerance = 1e-9) # Sign-preserving rescale - expect_true(all(sign(captured) == sign(bhat))) + expect_true(all(sign(scaled_cor) == sign(bhat))) }) From 5aeaaa5390790462ac20e43a285549840329987d Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 23 Apr 2026 23:40:09 -0400 Subject: [PATCH 2/5] Update lassosum PR note with LD-only selector rationale --- inst/notes/otters_lassosum_selector_fix_pr.md | 81 +++++++++++++++++-- 1 file changed, 75 insertions(+), 6 deletions(-) diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md index 1e67ce49..b2df44a0 100644 --- a/inst/notes/otters_lassosum_selector_fix_pr.md +++ b/inst/notes/otters_lassosum_selector_fix_pr.md @@ -2,12 +2,17 @@ ## Summary -This change makes OTTERS lassosum selection source-aware again. +This change fixes the immediate OTTERS lassosum regression, but the updated diagnostics also sharpen the longer-term conclusion: genotype is not fundamentally required for lassosum selection once the selector is written in LD form. -- PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation. -- PLINK2 sketch `pgen/pvar/psam` inputs use sketch-genotype pseudovalidation on restored `U`. -- Inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible. +- The regression was caused by replacing old pseudovalidation-based model selection with `min(fbeta)`. +- The compatibility patch in this PR restores old behavior conservatively: + - PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation. + - PLINK2 sketch `pgen/pvar/psam` inputs use sketch-side pseudovalidation on restored `U`. + - inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible. - The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver. +- New diagnostics after the patch show that the selector itself can be expressed as an LD quadratic form: + `score(beta) = (c^T beta) / sqrt(beta^T R beta)`. + On PLINK1-derived LD, that LD-only score reproduces old genotype-backed pseudovalidation almost exactly. ## Example 206 @@ -48,7 +53,63 @@ Implication: - restored sketch `U` recovers the score surface closely - sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()` -So PLINK1 is the exact old-parity path, while PLINK2 sketch is the right source-aware selector for sketch inputs but remains an approximation relative to old PLINK1 behavior. +So PLINK1 is the exact old-parity oracle in this PR, but it is no longer the conceptual end-state. The real conclusion is: + +- genotype is not fundamentally needed for lassosum selection +- the LD quadratic selector is the mathematically correct formulation +- the remaining mismatch is specific to how the sketch-side `R` / scaling is constructed, not to the selector formula itself + +## LD-Only Follow-up + +After the compatibility patch, we ran a direct genotype-vs-LD pseudovalidation diagnostic using the same candidate matrix and the same aligned `cor` vector. + +For each candidate `beta_k`, old pseudovalidation computes: + +```text +scaled_beta = beta / sd +pred = X * scaled_beta +score = (c^T beta) / sqrt(Var(pred)) +``` + +After centering and standardizing the matrix columns by the same `sd`, this becomes: + +```text +score(beta) = (c^T beta) / sqrt(beta^T R beta) +``` + +where `R` is the standardized LD correlation matrix. + +### PLINK1-derived LD + +The LD quadratic score matches genotype-backed pseudovalidation essentially exactly. + +- Fixture `161`: + - genotype best: `soft_lambda=0.041050213`, score `0.3922803981` + - LD-only PLINK1 best: `soft_lambda=0.041050213`, score `0.3922601179` + - Pearson `0.9999999`, same best candidate `TRUE` +- Fixture `206`: + - genotype best: `soft_lambda=0.029906976`, score `0.3895932657` + - LD-only PLINK1 best: `soft_lambda=0.029906976`, score `0.3897370312` + - Pearson `1.0000000`, same best candidate `TRUE` + +This validates the LD-only quadratic selector. Genotype was only one way the old code evaluated the score; it is not inherently required once the standardized `R` is available. + +### Sketch-derived LD + +The same LD-only formulation, +`score(beta) = (c^T beta) / sqrt(beta^T R_sketch beta)`, +does **not** yet reproduce the genotype selector on the informative `161` fixture. + +- Fixture `161`: + - genotype best: `soft_lambda=0.041050213` + - LD-only sketch best: `soft_lambda=0.021788613` + - Pearson `0.9971044`, same best candidate `FALSE` +- Fixture `206`: + - genotype best: `soft_lambda=0.029906976` + - LD-only sketch best: `soft_lambda=0.029906976` + - Pearson `0.9999503`, same best candidate `TRUE` + +So the unresolved problem is no longer the selector formula. It is the sketch-side construction of a standardized `R` that matches the PLINK1/genotype-backed path. ## What Is Fixed @@ -88,6 +149,12 @@ Compatibility: - PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback - pure precomputed-LD inputs remain on the fallback path and now say so explicitly +Important clarification from the new diagnostics: + +- this PR should not be read as claiming genotype is necessary for lassosum +- it should be read as a compatibility patch that restores the old selector while the cleaner LD-only selector is validated +- the LD-only selector is now validated on PLINK1-derived LD; the remaining work is to make the sketch-derived LD path produce the same standardized `R` + ## `R/otters.R` - `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available. @@ -116,5 +183,7 @@ Local evidence paths used during debugging: This change only alters the OTTERS lassosum selection path. - It restores exact old behavior when a legacy PLINK1 genotype source is available. -- It makes PLINK2 sketch inputs use a sketch-genotype selector instead of the old generic fallback. +- It makes PLINK2 sketch inputs use a sketch-side selector instead of the old generic fallback. - It does not change PRS-CS or SDPR behavior. + +The new diagnostics imply that a future cleanup should replace the current genotype/sketch split with one format-generic LD-only selector once the sketch-side `R` construction is brought into line with the PLINK1 path. From e94095038bf112f5dd572a1fb0de4642db084bc0 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Thu, 23 Apr 2026 23:56:41 -0400 Subject: [PATCH 3/5] Use LD-quadratic lassosum selection in OTTERS --- R/otters.R | 28 +- R/regularized_regression.R | 288 ++---------------- inst/notes/otters_lassosum_selector_fix_pr.md | 215 ++++++------- tests/testthat/test_otters.R | 30 +- tests/testthat/test_rr_dispatch.R | 84 ++--- tests/testthat/test_rr_lassosum.R | 4 +- 6 files changed, 151 insertions(+), 498 deletions(-) diff --git a/R/otters.R b/R/otters.R index 8f41ba90..e2823bca 100644 --- a/R/otters.R +++ b/R/otters.R @@ -50,7 +50,7 @@ #' R <- diag(p) #' sumstats <- data.frame(z = z) #' weights <- otters_weights(sumstats, R, n, -#' methods = list(lassosum_rss = list(selection = "min_fbeta")), +#' methods = list(lassosum_rss = list()), #' p_thresholds = c(0.05)) #' #' @export @@ -87,24 +87,6 @@ otters_weights <- function(sumstats, LD, n, p <- nrow(sumstats) z <- sumstats$z - chrom_col <- intersect(c("chrom", "CHROM"), names(sumstats)) - pos_col <- intersect(c("pos", "POS"), names(sumstats)) - a1_col <- intersect(c("A1", "a1"), names(sumstats)) - a2_col <- intersect(c("A2", "a2"), names(sumstats)) - variant_info <- if (length(chrom_col) && - length(pos_col) && - length(a1_col) && - length(a2_col)) { - data.frame( - chrom = sumstats[[chrom_col[[1]]]], - pos = sumstats[[pos_col[[1]]]], - A1 = sumstats[[a1_col[[1]]]], - A2 = sumstats[[a2_col[[1]]]], - stringsAsFactors = FALSE - ) - } else { - NULL - } # Build stat object for _weights() convention b <- z / sqrt(n) @@ -132,13 +114,7 @@ otters_weights <- function(sumstats, LD, n, next } tryCatch({ - method_args <- methods[[method_name]] - if (identical(method_name, "lassosum_rss") && - !is.null(variant_info) && - is.null(method_args$variant_info)) { - method_args$variant_info <- variant_info - } - w <- do.call(fn_name, c(list(stat = stat, LD = LD), method_args)) + w <- do.call(fn_name, c(list(stat = stat, LD = LD), methods[[method_name]])) results[[method_name]] <- as.numeric(w) }, error = function(e) { warning(sprintf("Method '%s' failed: %s", method_name, e$message)) diff --git a/R/regularized_regression.R b/R/regularized_regression.R index 3e7b0b0a..27d4b980 100644 --- a/R/regularized_regression.R +++ b/R/regularized_regression.R @@ -890,101 +890,6 @@ lassosum_rss <- function(bhat, LD, n, cor_input } -.lassosum_prepare_variant_info <- function(variant_info, expected_n = NULL) { - if (is.null(variant_info)) { - return(NULL) - } - parsed <- parse_variant_id(variant_info) - info <- data.frame( - chrom = as.integer(parsed$chrom), - pos = as.integer(parsed$pos), - A1 = as.character(parsed$A1), - A2 = as.character(parsed$A2), - stringsAsFactors = FALSE - ) - if (!is.null(expected_n) && nrow(info) != expected_n) { - stop("variant_info has ", nrow(info), " rows but expected ", expected_n, ".") - } - info -} - -.lassosum_filter_source_info <- function(info, region = NULL) { - if (is.null(region)) { - return(info) - } - parsed <- parse_region(region) - keep <- strip_chr_prefix(as.character(info$chrom)) == parsed$chrom & - as.integer(info$pos) >= parsed$start & - as.integer(info$pos) <= parsed$end - info[keep, , drop = FALSE] -} - -.lassosum_read_source_info <- function(genotype_source, genotype_type, region = NULL) { - if (genotype_type == "plink1") { - bim <- read_bim(paste0(genotype_source, ".bed")) - info <- data.frame( - id = as.character(bim$id), - chrom = as.character(bim$chrom), - pos = as.integer(bim$pos), - A1 = as.character(bim$a1), - A2 = as.character(bim$a0), - stringsAsFactors = FALSE - ) - } else if (genotype_type == "plink2") { - paths <- resolve_plink2_paths(genotype_source) - pvar <- read_pvar_text(paths$pvar) - info <- data.frame( - id = as.character(pvar$id), - chrom = as.character(pvar$chrom), - pos = as.integer(pvar$pos), - A1 = as.character(pvar$A1), - A2 = as.character(pvar$A2), - stringsAsFactors = FALSE - ) - } else { - stop("Unsupported genotype_type for lassosum source info: ", genotype_type) - } - .lassosum_filter_source_info(info, region = region) -} - -.lassosum_match_variants <- function(variant_info, source_info) { - target <- .lassosum_prepare_variant_info(variant_info) - source <- source_info - if (is.null(target) || !nrow(target) || !nrow(source)) { - return(data.frame()) - } - - target$old_idx <- seq_len(nrow(target)) - source$source_idx <- seq_len(nrow(source)) - source$chrom <- as.integer(strip_chr_prefix(as.character(source$chrom))) - source$pos <- as.integer(source$pos) - source$source_A1 <- as.character(source$A1) - source$source_A2 <- as.character(source$A2) - - same_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A1", "source_A2"), drop = FALSE] - names(same_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2") - same <- merge(target, same_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE) - same$orientation <- 1L - - flip_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A2", "source_A1"), drop = FALSE] - names(flip_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2") - flip <- merge(target, flip_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE) - flip$orientation <- -1L - - matched <- rbind(same, flip) - if (!nrow(matched)) { - return(data.frame()) - } - - matched$orientation_rank <- ifelse(matched$orientation == 1L, 0L, 1L) - matched <- matched[order(matched$old_idx, matched$orientation_rank, matched$source_idx), , drop = FALSE] - matched <- matched[!duplicated(matched$old_idx), , drop = FALSE] - matched <- matched[order(matched$source_idx, matched$orientation_rank, matched$old_idx), , drop = FALSE] - matched <- matched[!duplicated(matched$source_idx), , drop = FALSE] - matched <- matched[order(matched$source_idx), , drop = FALSE] - matched[, c("old_idx", "source_idx", "source_id", "orientation"), drop = FALSE] -} - .lassosum_first_max <- function(x) { which(x == max(x, na.rm = TRUE))[1] } @@ -998,149 +903,56 @@ lassosum_rss <- function(bhat, LD, n, ) } -.lassosum_score_candidates_with_bfile <- function(genotype_source, region, variant_info, - candidate_beta, cor_input) { - if (!requireNamespace("lassosum", quietly = TRUE)) { - stop("The 'lassosum' package is required for PLINK1 pseudovalidation.") - } - source_info <- .lassosum_read_source_info(genotype_source, "plink1", region = region) - matched <- .lassosum_match_variants(variant_info, source_info) - if (!nrow(matched)) { - stop("No overlapping variants between variant_info and PLINK1 genotype source.") - } - - beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*") - cor_oriented <- cor_input[matched$old_idx] * matched$orientation - extract_ids <- matched$source_id - sd_vec <- lassosum:::sd.bfile(bfile = genotype_source, extract = extract_ids, trace = 0) - scores <- lassosum:::pseudovalidation( - bfile = genotype_source, - beta = beta_oriented, - cor = cor_oriented, - extract = extract_ids, - sd = sd_vec - ) - scores[is.na(scores)] <- -Inf - idx <- .lassosum_first_max(scores) - list( - beta = candidate_beta[, idx], - index = idx, - mode = "plink1_pseudovalidation" - ) -} - -.lassosum_score_candidates_with_sketch <- function(genotype_source, region, variant_info, - candidate_beta, cor_input) { - sketch <- load_plink2_data(genotype_source, region = region) - source_info <- data.frame( - id = as.character(sketch$variant_info$id), - chrom = as.character(sketch$variant_info$chrom), - pos = as.integer(sketch$variant_info$pos), - A1 = as.character(sketch$variant_info$A1), - A2 = as.character(sketch$variant_info$A2), - stringsAsFactors = FALSE - ) - matched <- .lassosum_match_variants(variant_info, source_info) - if (!nrow(matched)) { - stop("No overlapping variants between variant_info and PLINK2 genotype source.") - } - - X <- sketch$X[, matched$source_idx, drop = FALSE] - X <- mean_impute(X) - beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*") - cor_oriented <- cor_input[matched$old_idx] * matched$orientation - sd_vec <- apply(X, 2, stats::sd) - sd_vec[!is.finite(sd_vec) | sd_vec <= 0] <- Inf - scaled_beta <- sweep(beta_oriented, 1, sd_vec, "/") - scaled_beta[!is.finite(scaled_beta)] <- 0 - pred <- X %*% scaled_beta - pred_centered <- scale(pred, scale = FALSE) - bxxb <- colSums(pred_centered^2) / nrow(pred_centered) - bxy <- as.numeric(crossprod(cor_oriented, beta_oriented)) - scores <- as.numeric(bxy / sqrt(bxxb)) - scores[!is.finite(scores)] <- -Inf +.lassosum_select_ld_quadratic <- function(candidate_beta, cor_input, LD) { + ld_beta <- LD %*% candidate_beta + bxy <- as.numeric(crossprod(cor_input, candidate_beta)) + bxxb <- colSums(candidate_beta * ld_beta) + scores <- rep(-Inf, length(bxy)) + positive <- is.finite(bxxb) & bxxb > 0 + scores[positive] <- bxy[positive] / sqrt(bxxb[positive]) idx <- .lassosum_first_max(scores) list( beta = candidate_beta[, idx], index = idx, - mode = "plink2_sketch_pseudovalidation" + mode = "ld_quadratic" ) } -.lassosum_resolve_selection_mode <- function(selection, genotype_source, genotype_type) { - if (selection == "min_fbeta") { - return("min_fbeta") - } - - if (genotype_type == "auto") { - if (is.null(genotype_source) || identical(genotype_source, "")) { - return("min_fbeta") - } - if (has_plink1_files(genotype_source)) { - return("plink1") - } - if (has_plink2_files(genotype_source)) { - return("plink2") - } - return("min_fbeta") - } - - if (genotype_type %in% c("plink1", "plink2")) { - return(genotype_type) - } - - "min_fbeta" -} - #' Extract weights from lassosum_rss with shrinkage grid search #' #' Searches over a grid of shrinkage parameters \code{s} (default: #' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS). #' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then #' \code{lassosum_rss()} is called across the lambda path. Candidate selection -#' depends on the available source: -#' \itemize{ -#' \item PLINK1 \code{.bed/.bim/.fam}: published \code{lassosum} pseudovalidation -#' \item PLINK2 sketch \code{.pgen/.pvar/.psam}: sketch-genotype pseudovalidation -#' \item no genotype-backed source: fallback to \code{min(fbeta)} with warning -#' } +#' defaults to the LD-only quadratic pseudovalidation score +#' \deqn{\frac{c^T \beta}{\sqrt{\beta^T R \beta}}} +#' evaluated on the supplied LD matrix \code{R}. This uses the same candidate +#' beta path as \code{lassosum_rss()}, but scores each candidate directly from +#' summary-statistics correlation \code{c} and LD, without requiring genotype. #' #' @details -#' OTTERS compatibility requires source-aware model selection. When a PLINK1 -#' genotype source is available, this wrapper reproduces the old -#' genotype-backed pseudovalidation selector. For PLINK2 sketch inputs, it -#' performs pseudovalidation on the restored \code{U} matrix using -#' \code{U_MIN/U_MAX}. If no genotype-backed source is provided, the function -#' warns and falls back to \code{min(fbeta)}. +#' The original lassosum pseudovalidation can be written as an LD quadratic +#' score after centering and standardizing the reference matrix columns by the +#' same per-variant scale: +#' \deqn{\mathrm{score}(\beta) = \frac{c^T \beta}{\sqrt{\beta^T R \beta}}.} +#' This implementation therefore uses the supplied LD matrix directly for +#' selection. \code{min(fbeta)} is retained only as an explicit debug option. #' #' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes). #' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk). #' @param s Numeric vector of shrinkage parameters to search over. Default: #' \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS. -#' @param selection Selection strategy. Default \code{"auto"} uses -#' source-aware pseudovalidation when possible and falls back to -#' \code{"min_fbeta"} otherwise. -#' @param genotype_source Optional PLINK prefix for genotype-backed or -#' sketch-genotype selection. -#' @param genotype_type Source type for \code{genotype_source}: \code{"auto"}, -#' \code{"plink1"}, \code{"plink2"}, or \code{"none"}. -#' @param region Optional region string \code{"chr:start-end"} used to filter -#' genotype source variants. -#' @param variant_info Optional variant annotation aligned to \code{stat} and -#' \code{LD}; must identify \code{chrom}, \code{pos}, \code{A1}, \code{A2}. +#' @param selection Selection strategy. Default \code{"ld_quadratic"} uses +#' \eqn{c^T \beta / \sqrt{\beta^T R \beta}} on the supplied LD matrix. +#' \code{"min_fbeta"} is retained as an explicit alternative for debugging. #' @param ... Additional arguments passed to \code{lassosum_rss()}. #' #' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda). #' @export lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), - selection = c("auto", "min_fbeta"), - genotype_source = NULL, - genotype_type = c("auto", "plink1", "plink2", "none"), - region = NULL, - variant_info = NULL, + selection = c("ld_quadratic", "min_fbeta"), ...) { selection <- match.arg(selection) - genotype_type <- match.arg(genotype_type) n <- median(stat$n) p <- nrow(LD) cor_input <- .lassosum_clamp_cor(.lassosum_cor_from_stat(stat, n = n, p = p)) @@ -1161,56 +973,10 @@ lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), } candidate_meta <- do.call(rbind, candidate_meta) - mode <- .lassosum_resolve_selection_mode(selection, genotype_source, genotype_type) - variant_info_prepped <- .lassosum_prepare_variant_info(variant_info, expected_n = p) - selector_result <- NULL - - if (mode == "plink1" || mode == "plink2") { - selector_result <- tryCatch( - { - if (is.null(variant_info_prepped)) { - stop("variant_info is required for genotype-backed lassosum selection.") - } - if (is.null(genotype_source) || identical(genotype_source, "")) { - stop("genotype_source is required for genotype-backed lassosum selection.") - } - if (mode == "plink1") { - .lassosum_score_candidates_with_bfile( - genotype_source = genotype_source, - region = region, - variant_info = variant_info_prepped, - candidate_beta = candidate_beta, - cor_input = cor_input - ) - } else { - .lassosum_score_candidates_with_sketch( - genotype_source = genotype_source, - region = region, - variant_info = variant_info_prepped, - candidate_beta = candidate_beta, - cor_input = cor_input - ) - } - }, - error = function(e) { - warning( - "lassosum_rss_weights could not use source-aware pseudovalidation (", - conditionMessage(e), - "); falling back to min(fbeta), which is not old-OTTERS-compatible." - ) - NULL - } - ) - } - - if (is.null(selector_result)) { - if (mode == "min_fbeta" && selection == "auto") { - warning( - "lassosum_rss_weights has no genotype-backed source for pseudovalidation; ", - "falling back to min(fbeta), which is not old-OTTERS-compatible." - ) - } - selector_result <- .lassosum_select_min_fbeta(candidate_beta, candidate_meta) + selector_result <- if (selection == "ld_quadratic") { + .lassosum_select_ld_quadratic(candidate_beta, cor_input, LD) + } else { + .lassosum_select_min_fbeta(candidate_beta, candidate_meta) } best_beta <- as.numeric(selector_result$beta) diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md index b2df44a0..c0028688 100644 --- a/inst/notes/otters_lassosum_selector_fix_pr.md +++ b/inst/notes/otters_lassosum_selector_fix_pr.md @@ -1,69 +1,56 @@ -# OTTERS Lassosum Selector Fix +# OTTERS Lassosum LD-Quadratic Selector Fix ## Summary -This change fixes the immediate OTTERS lassosum regression, but the updated diagnostics also sharpen the longer-term conclusion: genotype is not fundamentally required for lassosum selection once the selector is written in LD form. +This PR fixes the OTTERS lassosum regression by removing the old `min(fbeta)` +selector from the default OTTERS path and replacing it with the LD-quadratic +selector -- The regression was caused by replacing old pseudovalidation-based model selection with `min(fbeta)`. -- The compatibility patch in this PR restores old behavior conservatively: - - PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation. - - PLINK2 sketch `pgen/pvar/psam` inputs use sketch-side pseudovalidation on restored `U`. - - inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible. -- The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver. -- New diagnostics after the patch show that the selector itself can be expressed as an LD quadratic form: - `score(beta) = (c^T beta) / sqrt(beta^T R beta)`. - On PLINK1-derived LD, that LD-only score reproduces old genotype-backed pseudovalidation almost exactly. - -## Example 206 - -Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the regression. - -- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants -- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants - -Old published lassosum selected `s=0.2, lambda=1e-4`. Corrected-scaling plus `min(fbeta)` still selected `s=1, lambda=1e-4`. +```text +score(beta) = (c^T beta) / sqrt(beta^T R beta) +``` -Implication: +where: -- this is not a grid-definition problem -- both candidates are already on the old grid -- the regression comes from changing the selection rule over the same grid +- `c` is the aligned summary-statistics correlation vector +- `R` is the supplied LD correlation matrix +- `beta` is one candidate on the lassosum `(s, lambda)` path -For this fixture, old pseudovalidation is a degenerate tie case, so old OTTERS selects the first maximum. Replacing that with `min(fbeta)` moves the selected model to the opposite end of the same grid. +The important conclusion from the follow-up diagnostics is that genotype is not +fundamentally required for lassosum selection once the selector is written in +this LD form. The remaining sketch issue is about how `R` is constructed or +standardized, not about the selector formula itself. -## Example 161 +## What Failed -Fixture `chr1_161023868_163023868__ENSG00000162745` is the informative selector case. +Old OTTERS did not select lassosum models by `min(fbeta)`. It fit the beta path +and then used lassosum pseudovalidation to choose the final `(s, lambda)`. -On the common matched set of `4298` variants: +The refactor changed that selector to `min(fbeta)`, and the OTTERS wrapper also +double-scaled the lassosum input before it reached the low-level solver. -- exact PLINK1 bfile pseudovalidation best candidate: `soft_lambda=0.041050213` -- sketch-genotype pseudovalidation best candidate: `soft_lambda=0.021788613` -- score agreement remains high: - - Pearson `0.9971` - - Spearman `0.9412` - - max absolute difference `0.0474` +Those two changes were enough to move the selected model to a very different +part of the same grid. -When the sketch path is forced to use `sd_bfile`, the winning candidate returns to `soft_lambda=0.041050213`. +## Example 206 -Implication: +Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the selector bug. -- this is not a variant-overlap problem -- this is a predictive normalization problem inside pseudovalidation -- restored sketch `U` recovers the score surface closely -- sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()` +- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants +- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants -So PLINK1 is the exact old-parity oracle in this PR, but it is no longer the conceptual end-state. The real conclusion is: +On this fixture: -- genotype is not fundamentally needed for lassosum selection -- the LD quadratic selector is the mathematically correct formulation -- the remaining mismatch is specific to how the sketch-side `R` / scaling is constructed, not to the selector formula itself +- published lassosum selected `s = 0.2`, `lambda = 1e-4` +- `min(fbeta)` selected `s = 1`, `lambda = 1e-4` -## LD-Only Follow-up +This is not a grid-definition problem. Both candidates are already on the old +grid. The regression comes from changing the selection rule over the same +candidate path. -After the compatibility patch, we ran a direct genotype-vs-LD pseudovalidation diagnostic using the same candidate matrix and the same aligned `cor` vector. +## Mathematical Rationale -For each candidate `beta_k`, old pseudovalidation computes: +Old pseudovalidation can be written as: ```text scaled_beta = beta / sd @@ -71,119 +58,89 @@ pred = X * scaled_beta score = (c^T beta) / sqrt(Var(pred)) ``` -After centering and standardizing the matrix columns by the same `sd`, this becomes: +After centering and standardizing the columns of `X` by the same per-variant +scale, this becomes: ```text score(beta) = (c^T beta) / sqrt(beta^T R beta) ``` -where `R` is the standardized LD correlation matrix. +So the selector can be evaluated directly from summary-statistics correlation +and LD, without using genotype explicitly. That is the selector implemented in +this PR. + +## Validation + +We ran the LD-quadratic score directly against genotype-backed pseudovalidation +on the same candidate matrix. ### PLINK1-derived LD -The LD quadratic score matches genotype-backed pseudovalidation essentially exactly. +The LD-quadratic score matches genotype pseudovalidation essentially exactly. - Fixture `161`: - - genotype best: `soft_lambda=0.041050213`, score `0.3922803981` - - LD-only PLINK1 best: `soft_lambda=0.041050213`, score `0.3922601179` - - Pearson `0.9999999`, same best candidate `TRUE` + - genotype best: `soft_lambda=0.041050213` + - LD-quadratic PLINK1 best: `soft_lambda=0.041050213` + - Pearson `0.9999999` + - same best candidate `TRUE` - Fixture `206`: - - genotype best: `soft_lambda=0.029906976`, score `0.3895932657` - - LD-only PLINK1 best: `soft_lambda=0.029906976`, score `0.3897370312` - - Pearson `1.0000000`, same best candidate `TRUE` + - genotype best: `soft_lambda=0.029906976` + - LD-quadratic PLINK1 best: `soft_lambda=0.029906976` + - Pearson `1.0000000` + - same best candidate `TRUE` -This validates the LD-only quadratic selector. Genotype was only one way the old code evaluated the score; it is not inherently required once the standardized `R` is available. +This validates the selector formula itself. ### Sketch-derived LD -The same LD-only formulation, -`score(beta) = (c^T beta) / sqrt(beta^T R_sketch beta)`, -does **not** yet reproduce the genotype selector on the informative `161` fixture. +The same LD-quadratic selector applied to the current sketch-derived `R` gives: - Fixture `161`: - - genotype best: `soft_lambda=0.041050213` - - LD-only sketch best: `soft_lambda=0.021788613` - - Pearson `0.9971044`, same best candidate `FALSE` + - sketch-LD quadratic best: `soft_lambda=0.021788613` + - Pearson vs genotype pseudovalidation `0.9971044` + - same best candidate `FALSE` - Fixture `206`: - - genotype best: `soft_lambda=0.029906976` - - LD-only sketch best: `soft_lambda=0.029906976` - - Pearson `0.9999503`, same best candidate `TRUE` + - sketch-LD quadratic best: `soft_lambda=0.029906976` + - Pearson vs genotype pseudovalidation `0.9999503` + - same best candidate `TRUE` -So the unresolved problem is no longer the selector formula. It is the sketch-side construction of a standardized `R` that matches the PLINK1/genotype-backed path. +That result is consistent with the sketch sample-matrix diagnostic: the same +winner shift appears there too. So the remaining mismatch is not between +"sample-matrix pseudovalidation" and "quadratic LD scoring". It is between the +current sketch representation and the PLINK1/genotype-backed standardized LD +path. -## What Is Fixed +## What This PR Changes ## `R/regularized_regression.R` -- `lassosum_rss_weights()` no longer defaults every OTTERS run to `min(fbeta)`. -- It now selects by source: - - PLINK1: published `lassosum` pseudovalidation - - PLINK2 sketch: restored-`U` pseudovalidation - - no genotype-backed source: warning + `min(fbeta)` fallback -- It preserves old first-max tie behavior for pseudovalidation selectors. -- It passes lassosum input in correlation units only once before the low-level solver converts to `z / sqrt(n)`. - -What actually failed: - -- old OTTERS-compatible selection was replaced with `min(fbeta)` -- the OTTERS wrapper also double-scaled lassosum input - -Why it worked earlier: - -- old OTTERS used published `lassosum.pipeline(..., test.bfile = bfile)` followed by `pseudovalidate(out)` - -Why it failed here: - -- the refactor changed the selector and the scaling contract -- fixture `206` exposed the selector regression clearly because it is a tied pseudovalidation table where first-max and `min(fbeta)` pick different candidates - -Classification: - -- selector replacement: real behavioral regression -- double scaling: real implementation bug -- fixture `206` tie case: edge-case trigger that exposed the regression cleanly - -Compatibility: - -- exact old behavior is restored for PLINK1 genotype-backed OTTERS runs -- PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback -- pure precomputed-LD inputs remain on the fallback path and now say so explicitly - -Important clarification from the new diagnostics: - -- this PR should not be read as claiming genotype is necessary for lassosum -- it should be read as a compatibility patch that restores the old selector while the cleaner LD-only selector is validated -- the LD-only selector is now validated on PLINK1-derived LD; the remaining work is to make the sketch-derived LD path produce the same standardized `R` +- fixes the OTTERS lassosum scaling contract so correlation input is only + converted once before the low-level solver +- removes genotype-format-specific selector dispatch from + `lassosum_rss_weights()` +- makes the default selector `ld_quadratic` +- keeps `min(fbeta)` only as an explicit debug option +- preserves first-max tie behavior for equal selector scores ## `R/otters.R` -- `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available. -- This restores the variant-alignment information needed for source-aware selection. - -Classification: - -- wrapper or interface drift +- passes correlation-scale statistics into lassosum explicitly via + `stat$cor` / `stat$z` +- removes the temporary genotype-source / variant-metadata plumbing that was + only needed for the earlier compatibility patch -## Tests +## Scope -Focused validation for this change: +This PR only changes the OTTERS lassosum selection path. -- `tests/testthat/test_rr_lassosum.R` -- `tests/testthat/test_rr_dispatch.R` -- `tests/testthat/test_otters.R` +- It does not change PRS-CS or SDPR behavior. +- It does not claim the current sketch-derived LD is already identical to the + PLINK1/genotype-backed path. +- It does claim that the selector should be implemented in LD form, and that + `min(fbeta)` was the wrong default for OTTERS parity. -Local evidence paths used during debugging: +## Evidence Paths - `temp_reference/otters_regression/lassosum_oldR_direct_206/` - `temp_reference/otters_regression/lassosum_forensics_206/` - `temp_reference/otters_regression/pseudovalidation_sketch_vs_bfile/` - -## Narrow Compatibility Note - -This change only alters the OTTERS lassosum selection path. - -- It restores exact old behavior when a legacy PLINK1 genotype source is available. -- It makes PLINK2 sketch inputs use a sketch-side selector instead of the old generic fallback. -- It does not change PRS-CS or SDPR behavior. - -The new diagnostics imply that a future cleanup should replace the current genotype/sketch split with one format-generic LD-only selector once the sketch-side `R` construction is brought into line with the PLINK1 path. diff --git a/tests/testthat/test_otters.R b/tests/testthat/test_otters.R index d31ce2fd..b8e8de3e 100644 --- a/tests/testthat/test_otters.R +++ b/tests/testthat/test_otters.R @@ -9,7 +9,7 @@ test_that("otters_weights returns named list of weight vectors", { R <- diag(p) sumstats <- data.frame(z = z) result <- otters_weights(sumstats, R, n, - methods = list(lassosum_rss = list(selection = "min_fbeta")), + methods = list(lassosum_rss = list()), p_thresholds = c(0.05) ) expect_type(result, "list") @@ -26,7 +26,7 @@ test_that("otters_weights computes z from beta/se if z missing", { sumstats <- data.frame(beta = rnorm(p, sd = 0.1), se = rep(0.05, p)) R <- diag(p) result <- otters_weights(sumstats, R, n, - methods = list(lassosum_rss = list(selection = "min_fbeta")), + methods = list(lassosum_rss = list()), p_thresholds = c(0.05) ) expect_true("lassosum_rss" %in% names(result)) @@ -75,7 +75,7 @@ test_that("otters_weights with multiple methods returns all", { sumstats <- data.frame(z = z) result <- otters_weights(sumstats, R, n, methods = list( - lassosum_rss = list(selection = "min_fbeta"), + lassosum_rss = list(), prs_cs = list(n_iter = 50, n_burnin = 10, thin = 2, seed = 42) ), p_thresholds = c(0.001, 0.05) @@ -87,23 +87,16 @@ test_that("otters_weights with multiple methods returns all", { } }) -test_that("otters_weights forwards variant_info only to lassosum", { +test_that("otters_weights passes correlation-scale stat fields to lassosum", { p <- 5 n <- 100 z <- rnorm(p) R <- diag(p) - sumstats <- data.frame( - chrom = rep(1, p), - pos = seq_len(p), - A1 = c("A", "C", "G", "T", "A"), - A2 = c("G", "T", "A", "C", "G"), - z = z, - stringsAsFactors = FALSE - ) + sumstats <- data.frame(z = z) captured <- new.env(parent = emptyenv()) local_mocked_bindings( - lassosum_rss_weights = function(stat, LD, variant_info = NULL, ...) { - captured$lassosum_variant_info <- variant_info + lassosum_rss_weights = function(stat, LD, ...) { + captured$lassosum_stat <- stat captured$lassosum_dots <- list(...) rep(0.1, nrow(LD)) }, @@ -116,7 +109,7 @@ test_that("otters_weights forwards variant_info only to lassosum", { result <- otters_weights( sumstats, R, n, methods = list( - lassosum_rss = list(selection = "min_fbeta"), + lassosum_rss = list(), prs_cs = list(phi = 1e-4) ), p_thresholds = NULL, @@ -125,8 +118,9 @@ test_that("otters_weights forwards variant_info only to lassosum", { expect_equal(result$lassosum_rss, rep(0.1, p)) expect_equal(result$prs_cs, rep(0.2, p)) - expect_equal(captured$lassosum_variant_info, sumstats[, c("chrom", "pos", "A1", "A2")]) - expect_null(captured$prs_cs_dots$variant_info) + expect_equal(captured$lassosum_stat$cor, z / sqrt(n)) + expect_equal(captured$lassosum_stat$z, z) + expect_equal(captured$lassosum_stat$b, z / sqrt(n)) }) # ---- otters_association ---- @@ -201,7 +195,7 @@ test_that("otters_weights + otters_association end-to-end on simulated data", { # Stage I: train weights sumstats <- data.frame(z = eqtl_z) weights <- otters_weights(sumstats, R, n_eqtl, - methods = list(lassosum_rss = list(selection = "min_fbeta")), + methods = list(lassosum_rss = list()), p_thresholds = c(0.05) ) expect_true(length(weights) >= 2) diff --git a/tests/testthat/test_rr_dispatch.R b/tests/testthat/test_rr_dispatch.R index a17e44e9..2b2a4223 100644 --- a/tests/testthat/test_rr_dispatch.R +++ b/tests/testthat/test_rr_dispatch.R @@ -90,106 +90,66 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", { expect_false(identical(call_log$calls[[1]]$LD, call_log$calls[[2]]$LD)) }) -test_that("lassosum_rss_weights auto mode dispatches to PLINK1 pseudovalidation", { +test_that("lassosum_rss_weights defaults to LD-quadratic selection", { p <- 4 - stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) + stat <- list(cor = c(0.4, 0.1, 0, 0), n = rep(100, p)) R <- diag(p) - variant_info <- data.frame( - chrom = 1, - pos = 1:p, - A1 = c("A", "C", "G", "T"), - A2 = c("G", "T", "A", "C"), - stringsAsFactors = FALSE - ) + R[1, 2] <- 0.5 + R[2, 1] <- 0.5 + expected <- c(1, 0, 0, 0) local_mocked_bindings( - has_plink1_files = function(prefix) TRUE, - has_plink2_files = function(prefix) FALSE, lassosum_rss = function(bhat, LD, n, ...) { list( - beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)), + beta = cbind(expected, c(1, 1, 0, 0)), lambda = c(0.05, 0.01), - fbeta = c(2, 1) + fbeta = c(5, 1) ) - }, - .lassosum_score_candidates_with_bfile = function(...) { - list(beta = rep(0.7, p), index = 2L, mode = "plink1_pseudovalidation") } ) - result <- lassosum_rss_weights( - stat = stat, - LD = R, - s = 0.5, - selection = "auto", - genotype_source = "/fake/plink1", - variant_info = variant_info - ) - - expect_equal(c(result), rep(0.7, p)) - expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink1_pseudovalidation") + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5) + expect_equal(c(result), expected) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic") }) -test_that("lassosum_rss_weights auto mode dispatches to PLINK2 sketch pseudovalidation", { - p <- 4 - stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) +test_that("lassosum_rss_weights uses first-max tie behavior for LD-quadratic selection", { + p <- 3 + stat <- list(cor = c(0.3, 0.3, 0), n = rep(100, p)) R <- diag(p) - variant_info <- data.frame( - chrom = 1, - pos = 1:p, - A1 = c("A", "C", "G", "T"), - A2 = c("G", "T", "A", "C"), - stringsAsFactors = FALSE - ) local_mocked_bindings( - has_plink1_files = function(prefix) FALSE, - has_plink2_files = function(prefix) TRUE, lassosum_rss = function(bhat, LD, n, ...) { list( - beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)), + beta = cbind(c(1, 0, 0), c(0, 1, 0)), lambda = c(0.05, 0.01), fbeta = c(2, 1) ) - }, - .lassosum_score_candidates_with_sketch = function(...) { - list(beta = rep(0.4, p), index = 1L, mode = "plink2_sketch_pseudovalidation") } ) - result <- lassosum_rss_weights( - stat = stat, - LD = R, - s = 0.5, - selection = "auto", - genotype_source = "/fake/plink2", - variant_info = variant_info - ) - - expect_equal(c(result), rep(0.4, p)) - expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink2_sketch_pseudovalidation") + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5) + expect_equal(c(result), c(1, 0, 0)) + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic") }) -test_that("lassosum_rss_weights warns and falls back to min(fbeta) without genotype source", { +test_that("lassosum_rss_weights still supports explicit min(fbeta)", { p <- 4 - stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p)) + stat <- list(cor = c(0.4, 0.1, 0, 0), n = rep(100, p)) R <- diag(p) - expected <- rep(0.2, p) + expected <- c(1, 1, 0, 0) local_mocked_bindings( lassosum_rss = function(bhat, LD, n, ...) { list( - beta = cbind(rep(0, length(bhat)), expected), + beta = cbind(c(1, 0, 0, 0), expected), lambda = c(0.05, 0.01), fbeta = c(5, 1) ) } ) - result <- expect_warning( - lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "auto"), - "not old-OTTERS-compatible" - ) + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta") expect_equal(c(result), expected) expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta") }) diff --git a/tests/testthat/test_rr_lassosum.R b/tests/testthat/test_rr_lassosum.R index fda14943..27f74f5b 100644 --- a/tests/testthat/test_rr_lassosum.R +++ b/tests/testthat/test_rr_lassosum.R @@ -86,11 +86,11 @@ test_that("lassosum_rss_weights calls lassosum_rss and returns beta_est", { ) } ) - result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta") + result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5) expect_equal(length(result), p) expect_true(is.numeric(result)) expect_equal(c(result), expected) - expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta") + expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic") }) test_that("lassosum_rss_weights clamps correlation input before scaling", { From e01db0450051b42a0bd98a84faf5fab809dd1705 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Fri, 24 Apr 2026 00:00:49 -0400 Subject: [PATCH 4/5] Clarify source-matched lassosum validation note --- inst/notes/otters_lassosum_selector_fix_pr.md | 41 +++++++++---------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md index c0028688..5b6bb209 100644 --- a/inst/notes/otters_lassosum_selector_fix_pr.md +++ b/inst/notes/otters_lassosum_selector_fix_pr.md @@ -71,44 +71,43 @@ this PR. ## Validation -We ran the LD-quadratic score directly against genotype-backed pseudovalidation -on the same candidate matrix. +We validated the LD-quadratic score against the corresponding source-matched +sample-matrix pseudovalidation on the same candidate matrix. -### PLINK1-derived LD +### PLINK1 source: genotype matrix vs LD-quadratic -The LD-quadratic score matches genotype pseudovalidation essentially exactly. +The LD-quadratic score from PLINK1-derived LD matches PLINK1 genotype +pseudovalidation essentially exactly. - Fixture `161`: - - genotype best: `soft_lambda=0.041050213` - - LD-quadratic PLINK1 best: `soft_lambda=0.041050213` + - PLINK1 genotype best: `soft_lambda=0.041050213` + - PLINK1 LD-quadratic best: `soft_lambda=0.041050213` - Pearson `0.9999999` - same best candidate `TRUE` - Fixture `206`: - - genotype best: `soft_lambda=0.029906976` - - LD-quadratic PLINK1 best: `soft_lambda=0.029906976` + - PLINK1 genotype best: `soft_lambda=0.029906976` + - PLINK1 LD-quadratic best: `soft_lambda=0.029906976` - Pearson `1.0000000` - same best candidate `TRUE` This validates the selector formula itself. -### Sketch-derived LD +### Sketch source: sample matrix vs LD-quadratic -The same LD-quadratic selector applied to the current sketch-derived `R` gives: +For the sketch source, the sample-matrix pseudovalidation and the LD-quadratic +score are the same numeric object once both are built from the same restored +sketch matrix and the same column standardization. - Fixture `161`: - - sketch-LD quadratic best: `soft_lambda=0.021788613` - - Pearson vs genotype pseudovalidation `0.9971044` - - same best candidate `FALSE` -- Fixture `206`: - - sketch-LD quadratic best: `soft_lambda=0.029906976` - - Pearson vs genotype pseudovalidation `0.9999503` + - sketch sample-matrix best: `soft_lambda=0.021788613` + - sketch LD-quadratic best: `soft_lambda=0.021788613` + - Pearson `1.0` + - max absolute difference `< 1e-15` - same best candidate `TRUE` -That result is consistent with the sketch sample-matrix diagnostic: the same -winner shift appears there too. So the remaining mismatch is not between -"sample-matrix pseudovalidation" and "quadratic LD scoring". It is between the -current sketch representation and the PLINK1/genotype-backed standardized LD -path. +So the remaining mismatch is not between "sample-matrix pseudovalidation" and +"quadratic LD scoring". It is between the current sketch representation and the +PLINK1/genotype-backed standardized LD path. ## What This PR Changes From 9e93ee9a50439c066e5095245f54149c7a22a882 Mon Sep 17 00:00:00 2001 From: danielnachun Date: Fri, 24 Apr 2026 20:55:01 +0000 Subject: [PATCH 5/5] Update documentation --- man/lassosum_rss_weights.Rd | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/man/lassosum_rss_weights.Rd b/man/lassosum_rss_weights.Rd index 9bb71070..2d1cc2a3 100644 --- a/man/lassosum_rss_weights.Rd +++ b/man/lassosum_rss_weights.Rd @@ -4,7 +4,13 @@ \alias{lassosum_rss_weights} \title{Extract weights from lassosum_rss with shrinkage grid search} \usage{ -lassosum_rss_weights(stat, LD, s = c(0.2, 0.5, 0.9, 1), ...) +lassosum_rss_weights( + stat, + LD, + s = c(0.2, 0.5, 0.9, 1), + selection = c("ld_quadratic", "min_fbeta"), + ... +) } \arguments{ \item{stat}{A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).} @@ -14,6 +20,10 @@ lassosum_rss_weights(stat, LD, s = c(0.2, 0.5, 0.9, 1), ...) \item{s}{Numeric vector of shrinkage parameters to search over. Default: \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.} +\item{selection}{Selection strategy. Default \code{"ld_quadratic"} uses +\eqn{c^T \beta / \sqrt{\beta^T R \beta}} on the supplied LD matrix. +\code{"min_fbeta"} is retained as an explicit alternative for debugging.} + \item{...}{Additional arguments passed to \code{lassosum_rss()}.} } \value{ @@ -23,14 +33,18 @@ A numeric vector of the posterior SNP coefficients at the best (s, lambda). Searches over a grid of shrinkage parameters \code{s} (default: \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS). For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then -\code{lassosum_rss()} is called across the lambda path. The best -\code{(s, lambda)} combination is selected by the lowest objective value. +\code{lassosum_rss()} is called across the lambda path. Candidate selection +defaults to the LD-only quadratic pseudovalidation score +\deqn{\frac{c^T \beta}{\sqrt{\beta^T R \beta}}} +evaluated on the supplied LD matrix \code{R}. This uses the same candidate +beta path as \code{lassosum_rss()}, but scores each candidate directly from +summary-statistics correlation \code{c} and LD, without requiring genotype. } \details{ -Model selection uses \code{min(fbeta)} (penalized objective) rather than -the pseudovalidation approach from the original lassosum R package. Empirical -comparison over 20 random trials (n=300, p=50, 3 causal) shows no systematic -advantage for either method: pseudovalidation won 4/20, min(fbeta) won 6/20, -tied 10/20. The shrinkage grid over \code{s} provides the primary regularization; -lambda selection within each \code{s} has minimal impact. +The original lassosum pseudovalidation can be written as an LD quadratic +score after centering and standardizing the reference matrix columns by the +same per-variant scale: +\deqn{\mathrm{score}(\beta) = \frac{c^T \beta}{\sqrt{\beta^T R \beta}}.} +This implementation therefore uses the supplied LD matrix directly for +selection. \code{min(fbeta)} is retained only as an explicit debug option. }