StatFunGen · danielnachun · Apr 24, 2026 · Apr 23, 2026 · Apr 24, 2026 · Apr 24, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -51,6 +51,7 @@ Suggests:
     glmnet,
     harmonicmeanp,
     knitr,
+    lassosum,
     mashr,
     mr.mashr,
     mvsusieR,

diff --git a/R/otters.R b/R/otters.R
@@ -90,7 +90,7 @@ otters_weights <- function(sumstats, LD, n,
 
   # Build stat object for _weights() convention
   b <- z / sqrt(n)
-  stat <- list(b = b, n = rep(n, p))
+  stat <- list(b = b, cor = b, z = z, n = rep(n, p))
 
   results <- list()
 

diff --git a/R/regularized_regression.R b/R/regularized_regression.R
@@ -865,58 +865,128 @@ lassosum_rss <- function(bhat, LD, n,
   result
 }
 
+.lassosum_cor_from_stat <- function(stat, n, p) {
+  cor_input <- if (!is.null(stat$cor)) {
+    as.numeric(stat$cor)
+  } else if (!is.null(stat$z)) {
+    as.numeric(stat$z) / sqrt(n)
+  } else if (!is.null(stat$b)) {
+    as.numeric(stat$b)
+  } else {
+    stop("stat must contain one of 'cor', 'z', or 'b' for lassosum selection.")
+  }
+  if (length(cor_input) != p) {
+    stop("The length of lassosum input statistics (", length(cor_input),
+         ") must equal nrow(LD) (", p, ").")
+  }
+  cor_input
+}
+
+.lassosum_clamp_cor <- function(cor_input) {
+  max_abs_cor <- max(abs(cor_input), na.rm = TRUE)
+  if (is.finite(max_abs_cor) && max_abs_cor >= 1) {
+    cor_input <- cor_input / (max_abs_cor / 0.9999)
+  }
+  cor_input
+}
+
+.lassosum_first_max <- function(x) {
+  which(x == max(x, na.rm = TRUE))[1]
+}
+
+.lassosum_select_min_fbeta <- function(candidate_beta, candidate_meta) {
+  idx <- which.min(candidate_meta$fbeta)
+  list(
+    beta = candidate_beta[, idx],
+    index = idx,
+    mode = "min_fbeta"
+  )
+}
+
+.lassosum_select_ld_quadratic <- function(candidate_beta, cor_input, LD) {
+  ld_beta <- LD %*% candidate_beta
+  bxy <- as.numeric(crossprod(cor_input, candidate_beta))
+  bxxb <- colSums(candidate_beta * ld_beta)
+  scores <- rep(-Inf, length(bxy))
+  positive <- is.finite(bxxb) & bxxb > 0
+  scores[positive] <- bxy[positive] / sqrt(bxxb[positive])
+  idx <- .lassosum_first_max(scores)
+  list(
+    beta = candidate_beta[, idx],
+    index = idx,
+    mode = "ld_quadratic"
+  )
+}
+
 #' Extract weights from lassosum_rss with shrinkage grid search
 #'
 #' Searches over a grid of shrinkage parameters \code{s} (default:
 #' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS).
 #' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then
-#' \code{lassosum_rss()} is called across the lambda path. The best
-#' \code{(s, lambda)} combination is selected by the lowest objective value.
+#' \code{lassosum_rss()} is called across the lambda path. Candidate selection
+#' defaults to the LD-only quadratic pseudovalidation score
+#' \deqn{\frac{c^T \beta}{\sqrt{\beta^T R \beta}}}
+#' evaluated on the supplied LD matrix \code{R}. This uses the same candidate
+#' beta path as \code{lassosum_rss()}, but scores each candidate directly from
+#' summary-statistics correlation \code{c} and LD, without requiring genotype.
 #'
 #' @details
-#' Model selection uses \code{min(fbeta)} (penalized objective) rather than
-#' the pseudovalidation approach from the original lassosum R package. Empirical
-#' comparison over 20 random trials (n=300, p=50, 3 causal) shows no systematic
-#' advantage for either method: pseudovalidation won 4/20, min(fbeta) won 6/20,
-#' tied 10/20. The shrinkage grid over \code{s} provides the primary regularization;
-#' lambda selection within each \code{s} has minimal impact.
+#' The original lassosum pseudovalidation can be written as an LD quadratic
+#' score after centering and standardizing the reference matrix columns by the
+#' same per-variant scale:
+#' \deqn{\mathrm{score}(\beta) = \frac{c^T \beta}{\sqrt{\beta^T R \beta}}.}
+#' This implementation therefore uses the supplied LD matrix directly for
+#' selection. \code{min(fbeta)} is retained only as an explicit debug option.
 #'
 #' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).
 #' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk).
 #' @param s Numeric vector of shrinkage parameters to search over. Default:
 #'   \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.
+#' @param selection Selection strategy. Default \code{"ld_quadratic"} uses
+#'   \eqn{c^T \beta / \sqrt{\beta^T R \beta}} on the supplied LD matrix.
+#'   \code{"min_fbeta"} is retained as an explicit alternative for debugging.
 #' @param ... Additional arguments passed to \code{lassosum_rss()}.
 #'
 #' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda).
 #' @export
-lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), ...) {
+lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0),
+                                 selection = c("ld_quadratic", "min_fbeta"),
+                                 ...) {
+  selection <- match.arg(selection)
   n <- median(stat$n)
   p <- nrow(LD)
-  best_fbeta <- Inf
-  best_beta  <- rep(0, p)
-
-  # Clamp marginal correlations to (-1, 1) as required by lassosum.
-  # This is lassosum-specific — other methods (PRS-CS, SDPR) handle
-  # their own regularization and should not be globally rescaled.
-  # Matches OTTERS shrink_factor logic (PRSmodels/lassosum.R lines 71-77).
-  bhat <- stat$b
-  max_abs_b <- max(abs(bhat))
-  if (max_abs_b >= 1) {
-    bhat <- bhat / (max_abs_b / 0.9999)
-  }
+  cor_input <- .lassosum_clamp_cor(.lassosum_cor_from_stat(stat, n = n, p = p))
+  solver_input <- cor_input * sqrt(n)
+  candidate_beta <- NULL
+  candidate_meta <- list()
 
   for (s_val in s) {
-    # Shrink LD: R_s = (1 - s) * R + s * I
     LD_s <- (1 - s_val) * LD + s_val * diag(p)
-    model <- lassosum_rss(bhat = bhat, LD = list(blk1 = LD_s), n = n, ...)
-    min_fbeta <- min(model$fbeta)
-    if (min_fbeta < best_fbeta) {
-      best_fbeta <- min_fbeta
-      best_beta  <- model$beta_est
-    }
+    model <- lassosum_rss(bhat = solver_input, LD = list(blk1 = LD_s), n = n, ...)
+    candidate_beta <- cbind(candidate_beta, model$beta)
+    candidate_meta[[length(candidate_meta) + 1L]] <- data.frame(
+      s = rep(s_val, length(model$lambda)),
+      lambda = model$lambda,
+      fbeta = model$fbeta,
+      stringsAsFactors = FALSE
+    )
+  }
+  candidate_meta <- do.call(rbind, candidate_meta)
+
+  selector_result <- if (selection == "ld_quadratic") {
+    .lassosum_select_ld_quadratic(candidate_beta, cor_input, LD)
+  } else {
+    .lassosum_select_min_fbeta(candidate_beta, candidate_meta)
   }
 
-  return(best_beta)
+  best_beta <- as.numeric(selector_result$beta)
+  attr(best_beta, "lassosum_selection") <- c(
+    mode = selector_result$mode,
+    index = selector_result$index,
+    s = candidate_meta$s[selector_result$index],
+    lambda = candidate_meta$lambda[selector_result$index]
+  )
+  best_beta
 }
 
 #' Compute Weights Using ncvreg with SCAD or MCP Penalty

diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md
@@ -0,0 +1,145 @@
+# OTTERS Lassosum LD-Quadratic Selector Fix
+
+## Summary
+
+This PR fixes the OTTERS lassosum regression by removing the old `min(fbeta)`
+selector from the default OTTERS path and replacing it with the LD-quadratic
+selector
+
+```text
+score(beta) = (c^T beta) / sqrt(beta^T R beta)
+```
+
+where:
+
+- `c` is the aligned summary-statistics correlation vector
+- `R` is the supplied LD correlation matrix
+- `beta` is one candidate on the lassosum `(s, lambda)` path
+
+The important conclusion from the follow-up diagnostics is that genotype is not
+fundamentally required for lassosum selection once the selector is written in
+this LD form. The remaining sketch issue is about how `R` is constructed or
+standardized, not about the selector formula itself.
+
+## What Failed
+
+Old OTTERS did not select lassosum models by `min(fbeta)`. It fit the beta path
+and then used lassosum pseudovalidation to choose the final `(s, lambda)`.
+
+The refactor changed that selector to `min(fbeta)`, and the OTTERS wrapper also
+double-scaled the lassosum input before it reached the low-level solver.
+
+Those two changes were enough to move the selected model to a very different
+part of the same grid.
+
+## Example 206
+
+Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the selector bug.
+
+- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants
+- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants
+
+On this fixture:
+
+- published lassosum selected `s = 0.2`, `lambda = 1e-4`
+- `min(fbeta)` selected `s = 1`, `lambda = 1e-4`
+
+This is not a grid-definition problem. Both candidates are already on the old
+grid. The regression comes from changing the selection rule over the same
+candidate path.
+
+## Mathematical Rationale
+
+Old pseudovalidation can be written as:
+
+```text
+scaled_beta = beta / sd
+pred        = X * scaled_beta
+score       = (c^T beta) / sqrt(Var(pred))
+```
+
+After centering and standardizing the columns of `X` by the same per-variant
+scale, this becomes:
+
+```text
+score(beta) = (c^T beta) / sqrt(beta^T R beta)
+```
+
+So the selector can be evaluated directly from summary-statistics correlation
+and LD, without using genotype explicitly. That is the selector implemented in
+this PR.
+
+## Validation
+
+We validated the LD-quadratic score against the corresponding source-matched
+sample-matrix pseudovalidation on the same candidate matrix.
+
+### PLINK1 source: genotype matrix vs LD-quadratic
+
+The LD-quadratic score from PLINK1-derived LD matches PLINK1 genotype
+pseudovalidation essentially exactly.
+
+- Fixture `161`:
+  - PLINK1 genotype best: `soft_lambda=0.041050213`
+  - PLINK1 LD-quadratic best: `soft_lambda=0.041050213`
+  - Pearson `0.9999999`
+  - same best candidate `TRUE`
+- Fixture `206`:
+  - PLINK1 genotype best: `soft_lambda=0.029906976`
+  - PLINK1 LD-quadratic best: `soft_lambda=0.029906976`
+  - Pearson `1.0000000`
+  - same best candidate `TRUE`
+
+This validates the selector formula itself.
+
+### Sketch source: sample matrix vs LD-quadratic
+
+For the sketch source, the sample-matrix pseudovalidation and the LD-quadratic
+score are the same numeric object once both are built from the same restored
+sketch matrix and the same column standardization.
+
+- Fixture `161`:
+  - sketch sample-matrix best: `soft_lambda=0.021788613`
+  - sketch LD-quadratic best: `soft_lambda=0.021788613`
+  - Pearson `1.0`
+  - max absolute difference `< 1e-15`
+  - same best candidate `TRUE`
+
+So the remaining mismatch is not between "sample-matrix pseudovalidation" and
+"quadratic LD scoring". It is between the current sketch representation and the
+PLINK1/genotype-backed standardized LD path.
+
+## What This PR Changes
+
+## `R/regularized_regression.R`
+
+- fixes the OTTERS lassosum scaling contract so correlation input is only
+  converted once before the low-level solver
+- removes genotype-format-specific selector dispatch from
+  `lassosum_rss_weights()`
+- makes the default selector `ld_quadratic`
+- keeps `min(fbeta)` only as an explicit debug option
+- preserves first-max tie behavior for equal selector scores
+
+## `R/otters.R`
+
+- passes correlation-scale statistics into lassosum explicitly via
+  `stat$cor` / `stat$z`
+- removes the temporary genotype-source / variant-metadata plumbing that was
+  only needed for the earlier compatibility patch
+
+## Scope
+
+This PR only changes the OTTERS lassosum selection path.
+
+- It does not change PRS-CS or SDPR behavior.
+- It does not claim the current sketch-derived LD is already identical to the
+  PLINK1/genotype-backed path.
+- It does claim that the selector should be implemented in LD form, and that
+  `min(fbeta)` was the wrong default for OTTERS parity.
+
+## Evidence Paths
+
+- `temp_reference/otters_regression/lassosum_oldR_direct_206/`
+- `temp_reference/otters_regression/lassosum_forensics_206/`
+- `temp_reference/otters_regression/pseudovalidation_sketch_vs_bfile/`
diff --git a/man/lassosum_rss_weights.Rd b/man/lassosum_rss_weights.Rd