From 59f7e050e404e60c5e723351df5d0f12c73e5c66 Mon Sep 17 00:00:00 2001
From: Hao Sun <sunh14@lc06e01.chimera.hpc.mssm.edu>
Date: Thu, 23 Apr 2026 15:55:47 -0400
Subject: [PATCH 1/5] Fix OTTERS lassosum selector parity

---
 DESCRIPTION                                   |   1 +
 R/otters.R                                    |  30 +-
 R/regularized_regression.R                    | 360 ++++++++++++++++--
 inst/notes/otters_lassosum_selector_fix_pr.md | 120 ++++++
 tests/testthat/test_otters.R                  |  50 ++-
 tests/testthat/test_rr_dispatch.R             | 112 +++++-
 tests/testthat/test_rr_lassosum.R             |  32 +-
 7 files changed, 660 insertions(+), 45 deletions(-)
 create mode 100644 inst/notes/otters_lassosum_selector_fix_pr.md

diff --git a/DESCRIPTION b/DESCRIPTION
index f460e17c..5e858b8c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -51,6 +51,7 @@ Suggests:
     glmnet,
     harmonicmeanp,
     knitr,
+    lassosum,
     mashr,
     mr.mashr,
     mvsusieR,
diff --git a/R/otters.R b/R/otters.R
index 142cd469..8f41ba90 100644
--- a/R/otters.R
+++ b/R/otters.R
@@ -50,7 +50,7 @@
 #' R <- diag(p)
 #' sumstats <- data.frame(z = z)
 #' weights <- otters_weights(sumstats, R, n,
-#'   methods = list(lassosum_rss = list()),
+#'   methods = list(lassosum_rss = list(selection = "min_fbeta")),
 #'   p_thresholds = c(0.05))
 #'
 #' @export
@@ -87,10 +87,28 @@ otters_weights <- function(sumstats, LD, n,
 
   p <- nrow(sumstats)
   z <- sumstats$z
+  chrom_col <- intersect(c("chrom", "CHROM"), names(sumstats))
+  pos_col <- intersect(c("pos", "POS"), names(sumstats))
+  a1_col <- intersect(c("A1", "a1"), names(sumstats))
+  a2_col <- intersect(c("A2", "a2"), names(sumstats))
+  variant_info <- if (length(chrom_col) &&
+                      length(pos_col) &&
+                      length(a1_col) &&
+                      length(a2_col)) {
+    data.frame(
+      chrom = sumstats[[chrom_col[[1]]]],
+      pos = sumstats[[pos_col[[1]]]],
+      A1 = sumstats[[a1_col[[1]]]],
+      A2 = sumstats[[a2_col[[1]]]],
+      stringsAsFactors = FALSE
+    )
+  } else {
+    NULL
+  }
 
   # Build stat object for _weights() convention
   b <- z / sqrt(n)
-  stat <- list(b = b, n = rep(n, p))
+  stat <- list(b = b, cor = b, z = z, n = rep(n, p))
 
   results <- list()
 
@@ -114,7 +132,13 @@ otters_weights <- function(sumstats, LD, n,
       next
     }
     tryCatch({
-      w <- do.call(fn_name, c(list(stat = stat, LD = LD), methods[[method_name]]))
+      method_args <- methods[[method_name]]
+      if (identical(method_name, "lassosum_rss") &&
+          !is.null(variant_info) &&
+          is.null(method_args$variant_info)) {
+        method_args$variant_info <- variant_info
+      }
+      w <- do.call(fn_name, c(list(stat = stat, LD = LD), method_args))
       results[[method_name]] <- as.numeric(w)
     }, error = function(e) {
       warning(sprintf("Method '%s' failed: %s", method_name, e$message))
diff --git a/R/regularized_regression.R b/R/regularized_regression.R
index 2ff27b01..3e7b0b0a 100644
--- a/R/regularized_regression.R
+++ b/R/regularized_regression.R
@@ -865,58 +865,362 @@ lassosum_rss <- function(bhat, LD, n,
   result
 }
 
+.lassosum_cor_from_stat <- function(stat, n, p) {
+  cor_input <- if (!is.null(stat$cor)) {
+    as.numeric(stat$cor)
+  } else if (!is.null(stat$z)) {
+    as.numeric(stat$z) / sqrt(n)
+  } else if (!is.null(stat$b)) {
+    as.numeric(stat$b)
+  } else {
+    stop("stat must contain one of 'cor', 'z', or 'b' for lassosum selection.")
+  }
+  if (length(cor_input) != p) {
+    stop("The length of lassosum input statistics (", length(cor_input),
+         ") must equal nrow(LD) (", p, ").")
+  }
+  cor_input
+}
+
+.lassosum_clamp_cor <- function(cor_input) {
+  max_abs_cor <- max(abs(cor_input), na.rm = TRUE)
+  if (is.finite(max_abs_cor) && max_abs_cor >= 1) {
+    cor_input <- cor_input / (max_abs_cor / 0.9999)
+  }
+  cor_input
+}
+
+.lassosum_prepare_variant_info <- function(variant_info, expected_n = NULL) {
+  if (is.null(variant_info)) {
+    return(NULL)
+  }
+  parsed <- parse_variant_id(variant_info)
+  info <- data.frame(
+    chrom = as.integer(parsed$chrom),
+    pos = as.integer(parsed$pos),
+    A1 = as.character(parsed$A1),
+    A2 = as.character(parsed$A2),
+    stringsAsFactors = FALSE
+  )
+  if (!is.null(expected_n) && nrow(info) != expected_n) {
+    stop("variant_info has ", nrow(info), " rows but expected ", expected_n, ".")
+  }
+  info
+}
+
+.lassosum_filter_source_info <- function(info, region = NULL) {
+  if (is.null(region)) {
+    return(info)
+  }
+  parsed <- parse_region(region)
+  keep <- strip_chr_prefix(as.character(info$chrom)) == parsed$chrom &
+    as.integer(info$pos) >= parsed$start &
+    as.integer(info$pos) <= parsed$end
+  info[keep, , drop = FALSE]
+}
+
+.lassosum_read_source_info <- function(genotype_source, genotype_type, region = NULL) {
+  if (genotype_type == "plink1") {
+    bim <- read_bim(paste0(genotype_source, ".bed"))
+    info <- data.frame(
+      id = as.character(bim$id),
+      chrom = as.character(bim$chrom),
+      pos = as.integer(bim$pos),
+      A1 = as.character(bim$a1),
+      A2 = as.character(bim$a0),
+      stringsAsFactors = FALSE
+    )
+  } else if (genotype_type == "plink2") {
+    paths <- resolve_plink2_paths(genotype_source)
+    pvar <- read_pvar_text(paths$pvar)
+    info <- data.frame(
+      id = as.character(pvar$id),
+      chrom = as.character(pvar$chrom),
+      pos = as.integer(pvar$pos),
+      A1 = as.character(pvar$A1),
+      A2 = as.character(pvar$A2),
+      stringsAsFactors = FALSE
+    )
+  } else {
+    stop("Unsupported genotype_type for lassosum source info: ", genotype_type)
+  }
+  .lassosum_filter_source_info(info, region = region)
+}
+
+.lassosum_match_variants <- function(variant_info, source_info) {
+  target <- .lassosum_prepare_variant_info(variant_info)
+  source <- source_info
+  if (is.null(target) || !nrow(target) || !nrow(source)) {
+    return(data.frame())
+  }
+
+  target$old_idx <- seq_len(nrow(target))
+  source$source_idx <- seq_len(nrow(source))
+  source$chrom <- as.integer(strip_chr_prefix(as.character(source$chrom)))
+  source$pos <- as.integer(source$pos)
+  source$source_A1 <- as.character(source$A1)
+  source$source_A2 <- as.character(source$A2)
+
+  same_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A1", "source_A2"), drop = FALSE]
+  names(same_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2")
+  same <- merge(target, same_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE)
+  same$orientation <- 1L
+
+  flip_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A2", "source_A1"), drop = FALSE]
+  names(flip_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2")
+  flip <- merge(target, flip_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE)
+  flip$orientation <- -1L
+
+  matched <- rbind(same, flip)
+  if (!nrow(matched)) {
+    return(data.frame())
+  }
+
+  matched$orientation_rank <- ifelse(matched$orientation == 1L, 0L, 1L)
+  matched <- matched[order(matched$old_idx, matched$orientation_rank, matched$source_idx), , drop = FALSE]
+  matched <- matched[!duplicated(matched$old_idx), , drop = FALSE]
+  matched <- matched[order(matched$source_idx, matched$orientation_rank, matched$old_idx), , drop = FALSE]
+  matched <- matched[!duplicated(matched$source_idx), , drop = FALSE]
+  matched <- matched[order(matched$source_idx), , drop = FALSE]
+  matched[, c("old_idx", "source_idx", "source_id", "orientation"), drop = FALSE]
+}
+
+.lassosum_first_max <- function(x) {
+  which(x == max(x, na.rm = TRUE))[1]
+}
+
+.lassosum_select_min_fbeta <- function(candidate_beta, candidate_meta) {
+  idx <- which.min(candidate_meta$fbeta)
+  list(
+    beta = candidate_beta[, idx],
+    index = idx,
+    mode = "min_fbeta"
+  )
+}
+
+.lassosum_score_candidates_with_bfile <- function(genotype_source, region, variant_info,
+                                                  candidate_beta, cor_input) {
+  if (!requireNamespace("lassosum", quietly = TRUE)) {
+    stop("The 'lassosum' package is required for PLINK1 pseudovalidation.")
+  }
+  source_info <- .lassosum_read_source_info(genotype_source, "plink1", region = region)
+  matched <- .lassosum_match_variants(variant_info, source_info)
+  if (!nrow(matched)) {
+    stop("No overlapping variants between variant_info and PLINK1 genotype source.")
+  }
+
+  beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*")
+  cor_oriented <- cor_input[matched$old_idx] * matched$orientation
+  extract_ids <- matched$source_id
+  sd_vec <- lassosum:::sd.bfile(bfile = genotype_source, extract = extract_ids, trace = 0)
+  scores <- lassosum:::pseudovalidation(
+    bfile = genotype_source,
+    beta = beta_oriented,
+    cor = cor_oriented,
+    extract = extract_ids,
+    sd = sd_vec
+  )
+  scores[is.na(scores)] <- -Inf
+  idx <- .lassosum_first_max(scores)
+  list(
+    beta = candidate_beta[, idx],
+    index = idx,
+    mode = "plink1_pseudovalidation"
+  )
+}
+
+.lassosum_score_candidates_with_sketch <- function(genotype_source, region, variant_info,
+                                                   candidate_beta, cor_input) {
+  sketch <- load_plink2_data(genotype_source, region = region)
+  source_info <- data.frame(
+    id = as.character(sketch$variant_info$id),
+    chrom = as.character(sketch$variant_info$chrom),
+    pos = as.integer(sketch$variant_info$pos),
+    A1 = as.character(sketch$variant_info$A1),
+    A2 = as.character(sketch$variant_info$A2),
+    stringsAsFactors = FALSE
+  )
+  matched <- .lassosum_match_variants(variant_info, source_info)
+  if (!nrow(matched)) {
+    stop("No overlapping variants between variant_info and PLINK2 genotype source.")
+  }
+
+  X <- sketch$X[, matched$source_idx, drop = FALSE]
+  X <- mean_impute(X)
+  beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*")
+  cor_oriented <- cor_input[matched$old_idx] * matched$orientation
+  sd_vec <- apply(X, 2, stats::sd)
+  sd_vec[!is.finite(sd_vec) | sd_vec <= 0] <- Inf
+  scaled_beta <- sweep(beta_oriented, 1, sd_vec, "/")
+  scaled_beta[!is.finite(scaled_beta)] <- 0
+  pred <- X %*% scaled_beta
+  pred_centered <- scale(pred, scale = FALSE)
+  bxxb <- colSums(pred_centered^2) / nrow(pred_centered)
+  bxy <- as.numeric(crossprod(cor_oriented, beta_oriented))
+  scores <- as.numeric(bxy / sqrt(bxxb))
+  scores[!is.finite(scores)] <- -Inf
+  idx <- .lassosum_first_max(scores)
+  list(
+    beta = candidate_beta[, idx],
+    index = idx,
+    mode = "plink2_sketch_pseudovalidation"
+  )
+}
+
+.lassosum_resolve_selection_mode <- function(selection, genotype_source, genotype_type) {
+  if (selection == "min_fbeta") {
+    return("min_fbeta")
+  }
+
+  if (genotype_type == "auto") {
+    if (is.null(genotype_source) || identical(genotype_source, "")) {
+      return("min_fbeta")
+    }
+    if (has_plink1_files(genotype_source)) {
+      return("plink1")
+    }
+    if (has_plink2_files(genotype_source)) {
+      return("plink2")
+    }
+    return("min_fbeta")
+  }
+
+  if (genotype_type %in% c("plink1", "plink2")) {
+    return(genotype_type)
+  }
+
+  "min_fbeta"
+}
+
 #' Extract weights from lassosum_rss with shrinkage grid search
 #'
 #' Searches over a grid of shrinkage parameters \code{s} (default:
 #' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS).
 #' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then
-#' \code{lassosum_rss()} is called across the lambda path. The best
-#' \code{(s, lambda)} combination is selected by the lowest objective value.
+#' \code{lassosum_rss()} is called across the lambda path. Candidate selection
+#' depends on the available source:
+#' \itemize{
+#'   \item PLINK1 \code{.bed/.bim/.fam}: published \code{lassosum} pseudovalidation
+#'   \item PLINK2 sketch \code{.pgen/.pvar/.psam}: sketch-genotype pseudovalidation
+#'   \item no genotype-backed source: fallback to \code{min(fbeta)} with warning
+#' }
 #'
 #' @details
-#' Model selection uses \code{min(fbeta)} (penalized objective) rather than
-#' the pseudovalidation approach from the original lassosum R package. Empirical
-#' comparison over 20 random trials (n=300, p=50, 3 causal) shows no systematic
-#' advantage for either method: pseudovalidation won 4/20, min(fbeta) won 6/20,
-#' tied 10/20. The shrinkage grid over \code{s} provides the primary regularization;
-#' lambda selection within each \code{s} has minimal impact.
+#' OTTERS compatibility requires source-aware model selection. When a PLINK1
+#' genotype source is available, this wrapper reproduces the old
+#' genotype-backed pseudovalidation selector. For PLINK2 sketch inputs, it
+#' performs pseudovalidation on the restored \code{U} matrix using
+#' \code{U_MIN/U_MAX}. If no genotype-backed source is provided, the function
+#' warns and falls back to \code{min(fbeta)}.
 #'
 #' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).
 #' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk).
 #' @param s Numeric vector of shrinkage parameters to search over. Default:
 #'   \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.
+#' @param selection Selection strategy. Default \code{"auto"} uses
+#'   source-aware pseudovalidation when possible and falls back to
+#'   \code{"min_fbeta"} otherwise.
+#' @param genotype_source Optional PLINK prefix for genotype-backed or
+#'   sketch-genotype selection.
+#' @param genotype_type Source type for \code{genotype_source}: \code{"auto"},
+#'   \code{"plink1"}, \code{"plink2"}, or \code{"none"}.
+#' @param region Optional region string \code{"chr:start-end"} used to filter
+#'   genotype source variants.
+#' @param variant_info Optional variant annotation aligned to \code{stat} and
+#'   \code{LD}; must identify \code{chrom}, \code{pos}, \code{A1}, \code{A2}.
 #' @param ... Additional arguments passed to \code{lassosum_rss()}.
 #'
 #' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda).
 #' @export
-lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0), ...) {
+lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0),
+                                 selection = c("auto", "min_fbeta"),
+                                 genotype_source = NULL,
+                                 genotype_type = c("auto", "plink1", "plink2", "none"),
+                                 region = NULL,
+                                 variant_info = NULL,
+                                 ...) {
+  selection <- match.arg(selection)
+  genotype_type <- match.arg(genotype_type)
   n <- median(stat$n)
   p <- nrow(LD)
-  best_fbeta <- Inf
-  best_beta  <- rep(0, p)
-
-  # Clamp marginal correlations to (-1, 1) as required by lassosum.
-  # This is lassosum-specific — other methods (PRS-CS, SDPR) handle
-  # their own regularization and should not be globally rescaled.
-  # Matches OTTERS shrink_factor logic (PRSmodels/lassosum.R lines 71-77).
-  bhat <- stat$b
-  max_abs_b <- max(abs(bhat))
-  if (max_abs_b >= 1) {
-    bhat <- bhat / (max_abs_b / 0.9999)
-  }
+  cor_input <- .lassosum_clamp_cor(.lassosum_cor_from_stat(stat, n = n, p = p))
+  solver_input <- cor_input * sqrt(n)
+  candidate_beta <- NULL
+  candidate_meta <- list()
 
   for (s_val in s) {
-    # Shrink LD: R_s = (1 - s) * R + s * I
     LD_s <- (1 - s_val) * LD + s_val * diag(p)
-    model <- lassosum_rss(bhat = bhat, LD = list(blk1 = LD_s), n = n, ...)
-    min_fbeta <- min(model$fbeta)
-    if (min_fbeta < best_fbeta) {
-      best_fbeta <- min_fbeta
-      best_beta  <- model$beta_est
+    model <- lassosum_rss(bhat = solver_input, LD = list(blk1 = LD_s), n = n, ...)
+    candidate_beta <- cbind(candidate_beta, model$beta)
+    candidate_meta[[length(candidate_meta) + 1L]] <- data.frame(
+      s = rep(s_val, length(model$lambda)),
+      lambda = model$lambda,
+      fbeta = model$fbeta,
+      stringsAsFactors = FALSE
+    )
+  }
+  candidate_meta <- do.call(rbind, candidate_meta)
+
+  mode <- .lassosum_resolve_selection_mode(selection, genotype_source, genotype_type)
+  variant_info_prepped <- .lassosum_prepare_variant_info(variant_info, expected_n = p)
+  selector_result <- NULL
+
+  if (mode == "plink1" || mode == "plink2") {
+    selector_result <- tryCatch(
+      {
+        if (is.null(variant_info_prepped)) {
+          stop("variant_info is required for genotype-backed lassosum selection.")
+        }
+        if (is.null(genotype_source) || identical(genotype_source, "")) {
+          stop("genotype_source is required for genotype-backed lassosum selection.")
+        }
+        if (mode == "plink1") {
+          .lassosum_score_candidates_with_bfile(
+            genotype_source = genotype_source,
+            region = region,
+            variant_info = variant_info_prepped,
+            candidate_beta = candidate_beta,
+            cor_input = cor_input
+          )
+        } else {
+          .lassosum_score_candidates_with_sketch(
+            genotype_source = genotype_source,
+            region = region,
+            variant_info = variant_info_prepped,
+            candidate_beta = candidate_beta,
+            cor_input = cor_input
+          )
+        }
+      },
+      error = function(e) {
+        warning(
+          "lassosum_rss_weights could not use source-aware pseudovalidation (",
+          conditionMessage(e),
+          "); falling back to min(fbeta), which is not old-OTTERS-compatible."
+        )
+        NULL
+      }
+    )
+  }
+
+  if (is.null(selector_result)) {
+    if (mode == "min_fbeta" && selection == "auto") {
+      warning(
+        "lassosum_rss_weights has no genotype-backed source for pseudovalidation; ",
+        "falling back to min(fbeta), which is not old-OTTERS-compatible."
+      )
     }
+    selector_result <- .lassosum_select_min_fbeta(candidate_beta, candidate_meta)
   }
 
-  return(best_beta)
+  best_beta <- as.numeric(selector_result$beta)
+  attr(best_beta, "lassosum_selection") <- c(
+    mode = selector_result$mode,
+    index = selector_result$index,
+    s = candidate_meta$s[selector_result$index],
+    lambda = candidate_meta$lambda[selector_result$index]
+  )
+  best_beta
 }
 
 #' Compute Weights Using ncvreg with SCAD or MCP Penalty
diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md
new file mode 100644
index 00000000..1e67ce49
--- /dev/null
+++ b/inst/notes/otters_lassosum_selector_fix_pr.md
@@ -0,0 +1,120 @@
+# OTTERS Lassosum Selector Fix
+
+## Summary
+
+This change makes OTTERS lassosum selection source-aware again.
+
+- PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation.
+- PLINK2 sketch `pgen/pvar/psam` inputs use sketch-genotype pseudovalidation on restored `U`.
+- Inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible.
+- The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver.
+
+## Example 206
+
+Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the regression.
+
+- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants
+- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants
+
+Old published lassosum selected `s=0.2, lambda=1e-4`. Corrected-scaling plus `min(fbeta)` still selected `s=1, lambda=1e-4`.
+
+Implication:
+
+- this is not a grid-definition problem
+- both candidates are already on the old grid
+- the regression comes from changing the selection rule over the same grid
+
+For this fixture, old pseudovalidation is a degenerate tie case, so old OTTERS selects the first maximum. Replacing that with `min(fbeta)` moves the selected model to the opposite end of the same grid.
+
+## Example 161
+
+Fixture `chr1_161023868_163023868__ENSG00000162745` is the informative selector case.
+
+On the common matched set of `4298` variants:
+
+- exact PLINK1 bfile pseudovalidation best candidate: `soft_lambda=0.041050213`
+- sketch-genotype pseudovalidation best candidate: `soft_lambda=0.021788613`
+- score agreement remains high:
+  - Pearson `0.9971`
+  - Spearman `0.9412`
+  - max absolute difference `0.0474`
+
+When the sketch path is forced to use `sd_bfile`, the winning candidate returns to `soft_lambda=0.041050213`.
+
+Implication:
+
+- this is not a variant-overlap problem
+- this is a predictive normalization problem inside pseudovalidation
+- restored sketch `U` recovers the score surface closely
+- sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()`
+
+So PLINK1 is the exact old-parity path, while PLINK2 sketch is the right source-aware selector for sketch inputs but remains an approximation relative to old PLINK1 behavior.
+
+## What Is Fixed
+
+## `R/regularized_regression.R`
+
+- `lassosum_rss_weights()` no longer defaults every OTTERS run to `min(fbeta)`.
+- It now selects by source:
+  - PLINK1: published `lassosum` pseudovalidation
+  - PLINK2 sketch: restored-`U` pseudovalidation
+  - no genotype-backed source: warning + `min(fbeta)` fallback
+- It preserves old first-max tie behavior for pseudovalidation selectors.
+- It passes lassosum input in correlation units only once before the low-level solver converts to `z / sqrt(n)`.
+
+What actually failed:
+
+- old OTTERS-compatible selection was replaced with `min(fbeta)`
+- the OTTERS wrapper also double-scaled lassosum input
+
+Why it worked earlier:
+
+- old OTTERS used published `lassosum.pipeline(..., test.bfile = bfile)` followed by `pseudovalidate(out)`
+
+Why it failed here:
+
+- the refactor changed the selector and the scaling contract
+- fixture `206` exposed the selector regression clearly because it is a tied pseudovalidation table where first-max and `min(fbeta)` pick different candidates
+
+Classification:
+
+- selector replacement: real behavioral regression
+- double scaling: real implementation bug
+- fixture `206` tie case: edge-case trigger that exposed the regression cleanly
+
+Compatibility:
+
+- exact old behavior is restored for PLINK1 genotype-backed OTTERS runs
+- PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback
+- pure precomputed-LD inputs remain on the fallback path and now say so explicitly
+
+## `R/otters.R`
+
+- `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available.
+- This restores the variant-alignment information needed for source-aware selection.
+
+Classification:
+
+- wrapper or interface drift
+
+## Tests
+
+Focused validation for this change:
+
+- `tests/testthat/test_rr_lassosum.R`
+- `tests/testthat/test_rr_dispatch.R`
+- `tests/testthat/test_otters.R`
+
+Local evidence paths used during debugging:
+
+- `temp_reference/otters_regression/lassosum_oldR_direct_206/`
+- `temp_reference/otters_regression/lassosum_forensics_206/`
+- `temp_reference/otters_regression/pseudovalidation_sketch_vs_bfile/`
+
+## Narrow Compatibility Note
+
+This change only alters the OTTERS lassosum selection path.
+
+- It restores exact old behavior when a legacy PLINK1 genotype source is available.
+- It makes PLINK2 sketch inputs use a sketch-genotype selector instead of the old generic fallback.
+- It does not change PRS-CS or SDPR behavior.
diff --git a/tests/testthat/test_otters.R b/tests/testthat/test_otters.R
index fe3e4135..d31ce2fd 100644
--- a/tests/testthat/test_otters.R
+++ b/tests/testthat/test_otters.R
@@ -9,7 +9,7 @@ test_that("otters_weights returns named list of weight vectors", {
   R <- diag(p)
   sumstats <- data.frame(z = z)
   result <- otters_weights(sumstats, R, n,
-    methods = list(lassosum_rss = list()),
+    methods = list(lassosum_rss = list(selection = "min_fbeta")),
     p_thresholds = c(0.05)
   )
   expect_type(result, "list")
@@ -26,7 +26,7 @@ test_that("otters_weights computes z from beta/se if z missing", {
   sumstats <- data.frame(beta = rnorm(p, sd = 0.1), se = rep(0.05, p))
   R <- diag(p)
   result <- otters_weights(sumstats, R, n,
-    methods = list(lassosum_rss = list()),
+    methods = list(lassosum_rss = list(selection = "min_fbeta")),
     p_thresholds = c(0.05)
   )
   expect_true("lassosum_rss" %in% names(result))
@@ -75,7 +75,7 @@ test_that("otters_weights with multiple methods returns all", {
   sumstats <- data.frame(z = z)
   result <- otters_weights(sumstats, R, n,
     methods = list(
-      lassosum_rss = list(),
+      lassosum_rss = list(selection = "min_fbeta"),
       prs_cs = list(n_iter = 50, n_burnin = 10, thin = 2, seed = 42)
     ),
     p_thresholds = c(0.001, 0.05)
@@ -87,6 +87,48 @@ test_that("otters_weights with multiple methods returns all", {
   }
 })
 
+test_that("otters_weights forwards variant_info only to lassosum", {
+  p <- 5
+  n <- 100
+  z <- rnorm(p)
+  R <- diag(p)
+  sumstats <- data.frame(
+    chrom = rep(1, p),
+    pos = seq_len(p),
+    A1 = c("A", "C", "G", "T", "A"),
+    A2 = c("G", "T", "A", "C", "G"),
+    z = z,
+    stringsAsFactors = FALSE
+  )
+  captured <- new.env(parent = emptyenv())
+  local_mocked_bindings(
+    lassosum_rss_weights = function(stat, LD, variant_info = NULL, ...) {
+      captured$lassosum_variant_info <- variant_info
+      captured$lassosum_dots <- list(...)
+      rep(0.1, nrow(LD))
+    },
+    prs_cs_weights = function(stat, LD, ...) {
+      captured$prs_cs_dots <- list(...)
+      rep(0.2, nrow(LD))
+    }
+  )
+
+  result <- otters_weights(
+    sumstats, R, n,
+    methods = list(
+      lassosum_rss = list(selection = "min_fbeta"),
+      prs_cs = list(phi = 1e-4)
+    ),
+    p_thresholds = NULL,
+    check_ld_method = NULL
+  )
+
+  expect_equal(result$lassosum_rss, rep(0.1, p))
+  expect_equal(result$prs_cs, rep(0.2, p))
+  expect_equal(captured$lassosum_variant_info, sumstats[, c("chrom", "pos", "A1", "A2")])
+  expect_null(captured$prs_cs_dots$variant_info)
+})
+
 # ---- otters_association ----
 test_that("otters_association returns correct structure", {
   set.seed(42)
@@ -159,7 +201,7 @@ test_that("otters_weights + otters_association end-to-end on simulated data", {
   # Stage I: train weights
   sumstats <- data.frame(z = eqtl_z)
   weights <- otters_weights(sumstats, R, n_eqtl,
-    methods = list(lassosum_rss = list()),
+    methods = list(lassosum_rss = list(selection = "min_fbeta")),
     p_thresholds = c(0.05)
   )
   expect_true(length(weights) >= 2)
diff --git a/tests/testthat/test_rr_dispatch.R b/tests/testthat/test_rr_dispatch.R
index 40e5f778..a17e44e9 100644
--- a/tests/testthat/test_rr_dispatch.R
+++ b/tests/testthat/test_rr_dispatch.R
@@ -75,10 +75,14 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", {
     lassosum_rss = function(bhat, LD, n, ...) {
       call_log$calls <- c(call_log$calls,
                           list(list(bhat = bhat, LD = LD, n = n)))
-      list(beta_est = rep(0.05, length(bhat)), fbeta = c(1.0, 0.5))
+      list(
+        beta = cbind(rep(0, length(bhat)), rep(0.05, length(bhat))),
+        lambda = c(0.05, 0.01),
+        fbeta = c(1.0, 0.5)
+      )
     }
   )
-  lassosum_rss_weights(stat = stat, LD = R, s = c(0.2, 0.9))
+  lassosum_rss_weights(stat = stat, LD = R, s = c(0.2, 0.9), selection = "min_fbeta")
   expect_equal(length(call_log$calls), 2L)
   expect_equal(call_log$calls[[1]]$n, 100)
   expect_equal(length(call_log$calls[[1]]$bhat), p)
@@ -86,6 +90,110 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", {
   expect_false(identical(call_log$calls[[1]]$LD, call_log$calls[[2]]$LD))
 })
 
+test_that("lassosum_rss_weights auto mode dispatches to PLINK1 pseudovalidation", {
+  p <- 4
+  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+  R <- diag(p)
+  variant_info <- data.frame(
+    chrom = 1,
+    pos = 1:p,
+    A1 = c("A", "C", "G", "T"),
+    A2 = c("G", "T", "A", "C"),
+    stringsAsFactors = FALSE
+  )
+
+  local_mocked_bindings(
+    has_plink1_files = function(prefix) TRUE,
+    has_plink2_files = function(prefix) FALSE,
+    lassosum_rss = function(bhat, LD, n, ...) {
+      list(
+        beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)),
+        lambda = c(0.05, 0.01),
+        fbeta = c(2, 1)
+      )
+    },
+    .lassosum_score_candidates_with_bfile = function(...) {
+      list(beta = rep(0.7, p), index = 2L, mode = "plink1_pseudovalidation")
+    }
+  )
+
+  result <- lassosum_rss_weights(
+    stat = stat,
+    LD = R,
+    s = 0.5,
+    selection = "auto",
+    genotype_source = "/fake/plink1",
+    variant_info = variant_info
+  )
+
+  expect_equal(c(result), rep(0.7, p))
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink1_pseudovalidation")
+})
+
+test_that("lassosum_rss_weights auto mode dispatches to PLINK2 sketch pseudovalidation", {
+  p <- 4
+  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+  R <- diag(p)
+  variant_info <- data.frame(
+    chrom = 1,
+    pos = 1:p,
+    A1 = c("A", "C", "G", "T"),
+    A2 = c("G", "T", "A", "C"),
+    stringsAsFactors = FALSE
+  )
+
+  local_mocked_bindings(
+    has_plink1_files = function(prefix) FALSE,
+    has_plink2_files = function(prefix) TRUE,
+    lassosum_rss = function(bhat, LD, n, ...) {
+      list(
+        beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)),
+        lambda = c(0.05, 0.01),
+        fbeta = c(2, 1)
+      )
+    },
+    .lassosum_score_candidates_with_sketch = function(...) {
+      list(beta = rep(0.4, p), index = 1L, mode = "plink2_sketch_pseudovalidation")
+    }
+  )
+
+  result <- lassosum_rss_weights(
+    stat = stat,
+    LD = R,
+    s = 0.5,
+    selection = "auto",
+    genotype_source = "/fake/plink2",
+    variant_info = variant_info
+  )
+
+  expect_equal(c(result), rep(0.4, p))
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink2_sketch_pseudovalidation")
+})
+
+test_that("lassosum_rss_weights warns and falls back to min(fbeta) without genotype source", {
+  p <- 4
+  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+  R <- diag(p)
+  expected <- rep(0.2, p)
+
+  local_mocked_bindings(
+    lassosum_rss = function(bhat, LD, n, ...) {
+      list(
+        beta = cbind(rep(0, length(bhat)), expected),
+        lambda = c(0.05, 0.01),
+        fbeta = c(5, 1)
+      )
+    }
+  )
+
+  result <- expect_warning(
+    lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "auto"),
+    "not old-OTTERS-compatible"
+  )
+  expect_equal(c(result), expected)
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta")
+})
+
 test_that("bayes_{n,l,a,c,r}_weights each dispatch to bayes_alphabet_weights with correct method", {
   set.seed(42)
   X <- matrix(rnorm(50), nrow = 10)
diff --git a/tests/testthat/test_rr_lassosum.R b/tests/testthat/test_rr_lassosum.R
index d620d0fc..fda14943 100644
--- a/tests/testthat/test_rr_lassosum.R
+++ b/tests/testthat/test_rr_lassosum.R
@@ -76,12 +76,24 @@ test_that("lassosum_rss_weights calls lassosum_rss and returns beta_est", {
     R[i + 1, i] <- 0.3
   }
   stat <- list(b = bhat, n = rep(n, p))
-  result <- lassosum_rss_weights(stat = stat, LD = R)
+  expected <- seq_len(p) * 0.02
+  local_mocked_bindings(
+    lassosum_rss = function(bhat, LD, n, ...) {
+      list(
+        beta = cbind(rep(0, length(bhat)), expected),
+        lambda = c(0.05, 0.01),
+        fbeta = c(1, 0.1)
+      )
+    }
+  )
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta")
   expect_equal(length(result), p)
   expect_true(is.numeric(result))
+  expect_equal(c(result), expected)
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta")
 })
 
-test_that("lassosum_rss_weights rescales bhat when max(abs(bhat)) >= 1", {
+test_that("lassosum_rss_weights clamps correlation input before scaling", {
   set.seed(42)
   p <- 5
   n <- 100
@@ -93,13 +105,17 @@ test_that("lassosum_rss_weights rescales bhat when max(abs(bhat)) >= 1", {
   local_mocked_bindings(
     lassosum_rss = function(bhat, LD, n, ...) {
       captured <<- bhat
-      list(beta_est = rep(0, length(bhat)), fbeta = 1)
+      list(
+        beta = matrix(0, nrow = length(bhat), ncol = 2),
+        lambda = c(0.05, 0.01),
+        fbeta = c(1, 0.5)
+      )
     }
   )
-  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5)
-  # Captured bhat should have been rescaled so max abs is just under 1
-  expect_true(max(abs(captured)) < 1)
-  expect_equal(max(abs(captured)), 0.9999, tolerance = 1e-9)
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta")
+  scaled_cor <- captured / sqrt(n)
+  expect_true(max(abs(scaled_cor)) < 1)
+  expect_equal(max(abs(scaled_cor)), 0.9999, tolerance = 1e-9)
   # Sign-preserving rescale
-  expect_true(all(sign(captured) == sign(bhat)))
+  expect_true(all(sign(scaled_cor) == sign(bhat)))
 })

From 5aeaaa5390790462ac20e43a285549840329987d Mon Sep 17 00:00:00 2001
From: Hao Sun <sunh14@lc06e01.chimera.hpc.mssm.edu>
Date: Thu, 23 Apr 2026 23:40:09 -0400
Subject: [PATCH 2/5] Update lassosum PR note with LD-only selector rationale

---
 inst/notes/otters_lassosum_selector_fix_pr.md | 81 +++++++++++++++++--
 1 file changed, 75 insertions(+), 6 deletions(-)

diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md
index 1e67ce49..b2df44a0 100644
--- a/inst/notes/otters_lassosum_selector_fix_pr.md
+++ b/inst/notes/otters_lassosum_selector_fix_pr.md
@@ -2,12 +2,17 @@
 
 ## Summary
 
-This change makes OTTERS lassosum selection source-aware again.
+This change fixes the immediate OTTERS lassosum regression, but the updated diagnostics also sharpen the longer-term conclusion: genotype is not fundamentally required for lassosum selection once the selector is written in LD form.
 
-- PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation.
-- PLINK2 sketch `pgen/pvar/psam` inputs use sketch-genotype pseudovalidation on restored `U`.
-- Inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible.
+- The regression was caused by replacing old pseudovalidation-based model selection with `min(fbeta)`.
+- The compatibility patch in this PR restores old behavior conservatively:
+  - PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation.
+  - PLINK2 sketch `pgen/pvar/psam` inputs use sketch-side pseudovalidation on restored `U`.
+  - inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible.
 - The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver.
+- New diagnostics after the patch show that the selector itself can be expressed as an LD quadratic form:
+  `score(beta) = (c^T beta) / sqrt(beta^T R beta)`.
+  On PLINK1-derived LD, that LD-only score reproduces old genotype-backed pseudovalidation almost exactly.
 
 ## Example 206
 
@@ -48,7 +53,63 @@ Implication:
 - restored sketch `U` recovers the score surface closely
 - sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()`
 
-So PLINK1 is the exact old-parity path, while PLINK2 sketch is the right source-aware selector for sketch inputs but remains an approximation relative to old PLINK1 behavior.
+So PLINK1 is the exact old-parity oracle in this PR, but it is no longer the conceptual end-state. The real conclusion is:
+
+- genotype is not fundamentally needed for lassosum selection
+- the LD quadratic selector is the mathematically correct formulation
+- the remaining mismatch is specific to how the sketch-side `R` / scaling is constructed, not to the selector formula itself
+
+## LD-Only Follow-up
+
+After the compatibility patch, we ran a direct genotype-vs-LD pseudovalidation diagnostic using the same candidate matrix and the same aligned `cor` vector.
+
+For each candidate `beta_k`, old pseudovalidation computes:
+
+```text
+scaled_beta = beta / sd
+pred        = X * scaled_beta
+score       = (c^T beta) / sqrt(Var(pred))
+```
+
+After centering and standardizing the matrix columns by the same `sd`, this becomes:
+
+```text
+score(beta) = (c^T beta) / sqrt(beta^T R beta)
+```
+
+where `R` is the standardized LD correlation matrix.
+
+### PLINK1-derived LD
+
+The LD quadratic score matches genotype-backed pseudovalidation essentially exactly.
+
+- Fixture `161`:
+  - genotype best: `soft_lambda=0.041050213`, score `0.3922803981`
+  - LD-only PLINK1 best: `soft_lambda=0.041050213`, score `0.3922601179`
+  - Pearson `0.9999999`, same best candidate `TRUE`
+- Fixture `206`:
+  - genotype best: `soft_lambda=0.029906976`, score `0.3895932657`
+  - LD-only PLINK1 best: `soft_lambda=0.029906976`, score `0.3897370312`
+  - Pearson `1.0000000`, same best candidate `TRUE`
+
+This validates the LD-only quadratic selector. Genotype was only one way the old code evaluated the score; it is not inherently required once the standardized `R` is available.
+
+### Sketch-derived LD
+
+The same LD-only formulation,
+`score(beta) = (c^T beta) / sqrt(beta^T R_sketch beta)`,
+does **not** yet reproduce the genotype selector on the informative `161` fixture.
+
+- Fixture `161`:
+  - genotype best: `soft_lambda=0.041050213`
+  - LD-only sketch best: `soft_lambda=0.021788613`
+  - Pearson `0.9971044`, same best candidate `FALSE`
+- Fixture `206`:
+  - genotype best: `soft_lambda=0.029906976`
+  - LD-only sketch best: `soft_lambda=0.029906976`
+  - Pearson `0.9999503`, same best candidate `TRUE`
+
+So the unresolved problem is no longer the selector formula. It is the sketch-side construction of a standardized `R` that matches the PLINK1/genotype-backed path.
 
 ## What Is Fixed
 
@@ -88,6 +149,12 @@ Compatibility:
 - PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback
 - pure precomputed-LD inputs remain on the fallback path and now say so explicitly
 
+Important clarification from the new diagnostics:
+
+- this PR should not be read as claiming genotype is necessary for lassosum
+- it should be read as a compatibility patch that restores the old selector while the cleaner LD-only selector is validated
+- the LD-only selector is now validated on PLINK1-derived LD; the remaining work is to make the sketch-derived LD path produce the same standardized `R`
+
 ## `R/otters.R`
 
 - `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available.
@@ -116,5 +183,7 @@ Local evidence paths used during debugging:
 This change only alters the OTTERS lassosum selection path.
 
 - It restores exact old behavior when a legacy PLINK1 genotype source is available.
-- It makes PLINK2 sketch inputs use a sketch-genotype selector instead of the old generic fallback.
+- It makes PLINK2 sketch inputs use a sketch-side selector instead of the old generic fallback.
 - It does not change PRS-CS or SDPR behavior.
+
+The new diagnostics imply that a future cleanup should replace the current genotype/sketch split with one format-generic LD-only selector once the sketch-side `R` construction is brought into line with the PLINK1 path.

From e94095038bf112f5dd572a1fb0de4642db084bc0 Mon Sep 17 00:00:00 2001
From: Hao Sun <sunh14@lc06e01.chimera.hpc.mssm.edu>
Date: Thu, 23 Apr 2026 23:56:41 -0400
Subject: [PATCH 3/5] Use LD-quadratic lassosum selection in OTTERS

---
 R/otters.R                                    |  28 +-
 R/regularized_regression.R                    | 288 ++----------------
 inst/notes/otters_lassosum_selector_fix_pr.md | 215 ++++++-------
 tests/testthat/test_otters.R                  |  30 +-
 tests/testthat/test_rr_dispatch.R             |  84 ++---
 tests/testthat/test_rr_lassosum.R             |   4 +-
 6 files changed, 151 insertions(+), 498 deletions(-)

diff --git a/R/otters.R b/R/otters.R
index 8f41ba90..e2823bca 100644
--- a/R/otters.R
+++ b/R/otters.R
@@ -50,7 +50,7 @@
 #' R <- diag(p)
 #' sumstats <- data.frame(z = z)
 #' weights <- otters_weights(sumstats, R, n,
-#'   methods = list(lassosum_rss = list(selection = "min_fbeta")),
+#'   methods = list(lassosum_rss = list()),
 #'   p_thresholds = c(0.05))
 #'
 #' @export
@@ -87,24 +87,6 @@ otters_weights <- function(sumstats, LD, n,
 
   p <- nrow(sumstats)
   z <- sumstats$z
-  chrom_col <- intersect(c("chrom", "CHROM"), names(sumstats))
-  pos_col <- intersect(c("pos", "POS"), names(sumstats))
-  a1_col <- intersect(c("A1", "a1"), names(sumstats))
-  a2_col <- intersect(c("A2", "a2"), names(sumstats))
-  variant_info <- if (length(chrom_col) &&
-                      length(pos_col) &&
-                      length(a1_col) &&
-                      length(a2_col)) {
-    data.frame(
-      chrom = sumstats[[chrom_col[[1]]]],
-      pos = sumstats[[pos_col[[1]]]],
-      A1 = sumstats[[a1_col[[1]]]],
-      A2 = sumstats[[a2_col[[1]]]],
-      stringsAsFactors = FALSE
-    )
-  } else {
-    NULL
-  }
 
   # Build stat object for _weights() convention
   b <- z / sqrt(n)
@@ -132,13 +114,7 @@ otters_weights <- function(sumstats, LD, n,
       next
     }
     tryCatch({
-      method_args <- methods[[method_name]]
-      if (identical(method_name, "lassosum_rss") &&
-          !is.null(variant_info) &&
-          is.null(method_args$variant_info)) {
-        method_args$variant_info <- variant_info
-      }
-      w <- do.call(fn_name, c(list(stat = stat, LD = LD), method_args))
+      w <- do.call(fn_name, c(list(stat = stat, LD = LD), methods[[method_name]]))
       results[[method_name]] <- as.numeric(w)
     }, error = function(e) {
       warning(sprintf("Method '%s' failed: %s", method_name, e$message))
diff --git a/R/regularized_regression.R b/R/regularized_regression.R
index 3e7b0b0a..27d4b980 100644
--- a/R/regularized_regression.R
+++ b/R/regularized_regression.R
@@ -890,101 +890,6 @@ lassosum_rss <- function(bhat, LD, n,
   cor_input
 }
 
-.lassosum_prepare_variant_info <- function(variant_info, expected_n = NULL) {
-  if (is.null(variant_info)) {
-    return(NULL)
-  }
-  parsed <- parse_variant_id(variant_info)
-  info <- data.frame(
-    chrom = as.integer(parsed$chrom),
-    pos = as.integer(parsed$pos),
-    A1 = as.character(parsed$A1),
-    A2 = as.character(parsed$A2),
-    stringsAsFactors = FALSE
-  )
-  if (!is.null(expected_n) && nrow(info) != expected_n) {
-    stop("variant_info has ", nrow(info), " rows but expected ", expected_n, ".")
-  }
-  info
-}
-
-.lassosum_filter_source_info <- function(info, region = NULL) {
-  if (is.null(region)) {
-    return(info)
-  }
-  parsed <- parse_region(region)
-  keep <- strip_chr_prefix(as.character(info$chrom)) == parsed$chrom &
-    as.integer(info$pos) >= parsed$start &
-    as.integer(info$pos) <= parsed$end
-  info[keep, , drop = FALSE]
-}
-
-.lassosum_read_source_info <- function(genotype_source, genotype_type, region = NULL) {
-  if (genotype_type == "plink1") {
-    bim <- read_bim(paste0(genotype_source, ".bed"))
-    info <- data.frame(
-      id = as.character(bim$id),
-      chrom = as.character(bim$chrom),
-      pos = as.integer(bim$pos),
-      A1 = as.character(bim$a1),
-      A2 = as.character(bim$a0),
-      stringsAsFactors = FALSE
-    )
-  } else if (genotype_type == "plink2") {
-    paths <- resolve_plink2_paths(genotype_source)
-    pvar <- read_pvar_text(paths$pvar)
-    info <- data.frame(
-      id = as.character(pvar$id),
-      chrom = as.character(pvar$chrom),
-      pos = as.integer(pvar$pos),
-      A1 = as.character(pvar$A1),
-      A2 = as.character(pvar$A2),
-      stringsAsFactors = FALSE
-    )
-  } else {
-    stop("Unsupported genotype_type for lassosum source info: ", genotype_type)
-  }
-  .lassosum_filter_source_info(info, region = region)
-}
-
-.lassosum_match_variants <- function(variant_info, source_info) {
-  target <- .lassosum_prepare_variant_info(variant_info)
-  source <- source_info
-  if (is.null(target) || !nrow(target) || !nrow(source)) {
-    return(data.frame())
-  }
-
-  target$old_idx <- seq_len(nrow(target))
-  source$source_idx <- seq_len(nrow(source))
-  source$chrom <- as.integer(strip_chr_prefix(as.character(source$chrom)))
-  source$pos <- as.integer(source$pos)
-  source$source_A1 <- as.character(source$A1)
-  source$source_A2 <- as.character(source$A2)
-
-  same_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A1", "source_A2"), drop = FALSE]
-  names(same_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2")
-  same <- merge(target, same_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE)
-  same$orientation <- 1L
-
-  flip_source <- source[, c("source_idx", "id", "chrom", "pos", "source_A2", "source_A1"), drop = FALSE]
-  names(flip_source) <- c("source_idx", "source_id", "chrom", "pos", "A1", "A2")
-  flip <- merge(target, flip_source, by = c("chrom", "pos", "A1", "A2"), all = FALSE)
-  flip$orientation <- -1L
-
-  matched <- rbind(same, flip)
-  if (!nrow(matched)) {
-    return(data.frame())
-  }
-
-  matched$orientation_rank <- ifelse(matched$orientation == 1L, 0L, 1L)
-  matched <- matched[order(matched$old_idx, matched$orientation_rank, matched$source_idx), , drop = FALSE]
-  matched <- matched[!duplicated(matched$old_idx), , drop = FALSE]
-  matched <- matched[order(matched$source_idx, matched$orientation_rank, matched$old_idx), , drop = FALSE]
-  matched <- matched[!duplicated(matched$source_idx), , drop = FALSE]
-  matched <- matched[order(matched$source_idx), , drop = FALSE]
-  matched[, c("old_idx", "source_idx", "source_id", "orientation"), drop = FALSE]
-}
-
 .lassosum_first_max <- function(x) {
   which(x == max(x, na.rm = TRUE))[1]
 }
@@ -998,149 +903,56 @@ lassosum_rss <- function(bhat, LD, n,
   )
 }
 
-.lassosum_score_candidates_with_bfile <- function(genotype_source, region, variant_info,
-                                                  candidate_beta, cor_input) {
-  if (!requireNamespace("lassosum", quietly = TRUE)) {
-    stop("The 'lassosum' package is required for PLINK1 pseudovalidation.")
-  }
-  source_info <- .lassosum_read_source_info(genotype_source, "plink1", region = region)
-  matched <- .lassosum_match_variants(variant_info, source_info)
-  if (!nrow(matched)) {
-    stop("No overlapping variants between variant_info and PLINK1 genotype source.")
-  }
-
-  beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*")
-  cor_oriented <- cor_input[matched$old_idx] * matched$orientation
-  extract_ids <- matched$source_id
-  sd_vec <- lassosum:::sd.bfile(bfile = genotype_source, extract = extract_ids, trace = 0)
-  scores <- lassosum:::pseudovalidation(
-    bfile = genotype_source,
-    beta = beta_oriented,
-    cor = cor_oriented,
-    extract = extract_ids,
-    sd = sd_vec
-  )
-  scores[is.na(scores)] <- -Inf
-  idx <- .lassosum_first_max(scores)
-  list(
-    beta = candidate_beta[, idx],
-    index = idx,
-    mode = "plink1_pseudovalidation"
-  )
-}
-
-.lassosum_score_candidates_with_sketch <- function(genotype_source, region, variant_info,
-                                                   candidate_beta, cor_input) {
-  sketch <- load_plink2_data(genotype_source, region = region)
-  source_info <- data.frame(
-    id = as.character(sketch$variant_info$id),
-    chrom = as.character(sketch$variant_info$chrom),
-    pos = as.integer(sketch$variant_info$pos),
-    A1 = as.character(sketch$variant_info$A1),
-    A2 = as.character(sketch$variant_info$A2),
-    stringsAsFactors = FALSE
-  )
-  matched <- .lassosum_match_variants(variant_info, source_info)
-  if (!nrow(matched)) {
-    stop("No overlapping variants between variant_info and PLINK2 genotype source.")
-  }
-
-  X <- sketch$X[, matched$source_idx, drop = FALSE]
-  X <- mean_impute(X)
-  beta_oriented <- sweep(candidate_beta[matched$old_idx, , drop = FALSE], 1, matched$orientation, "*")
-  cor_oriented <- cor_input[matched$old_idx] * matched$orientation
-  sd_vec <- apply(X, 2, stats::sd)
-  sd_vec[!is.finite(sd_vec) | sd_vec <= 0] <- Inf
-  scaled_beta <- sweep(beta_oriented, 1, sd_vec, "/")
-  scaled_beta[!is.finite(scaled_beta)] <- 0
-  pred <- X %*% scaled_beta
-  pred_centered <- scale(pred, scale = FALSE)
-  bxxb <- colSums(pred_centered^2) / nrow(pred_centered)
-  bxy <- as.numeric(crossprod(cor_oriented, beta_oriented))
-  scores <- as.numeric(bxy / sqrt(bxxb))
-  scores[!is.finite(scores)] <- -Inf
+.lassosum_select_ld_quadratic <- function(candidate_beta, cor_input, LD) {
+  ld_beta <- LD %*% candidate_beta
+  bxy <- as.numeric(crossprod(cor_input, candidate_beta))
+  bxxb <- colSums(candidate_beta * ld_beta)
+  scores <- rep(-Inf, length(bxy))
+  positive <- is.finite(bxxb) & bxxb > 0
+  scores[positive] <- bxy[positive] / sqrt(bxxb[positive])
   idx <- .lassosum_first_max(scores)
   list(
     beta = candidate_beta[, idx],
     index = idx,
-    mode = "plink2_sketch_pseudovalidation"
+    mode = "ld_quadratic"
   )
 }
 
-.lassosum_resolve_selection_mode <- function(selection, genotype_source, genotype_type) {
-  if (selection == "min_fbeta") {
-    return("min_fbeta")
-  }
-
-  if (genotype_type == "auto") {
-    if (is.null(genotype_source) || identical(genotype_source, "")) {
-      return("min_fbeta")
-    }
-    if (has_plink1_files(genotype_source)) {
-      return("plink1")
-    }
-    if (has_plink2_files(genotype_source)) {
-      return("plink2")
-    }
-    return("min_fbeta")
-  }
-
-  if (genotype_type %in% c("plink1", "plink2")) {
-    return(genotype_type)
-  }
-
-  "min_fbeta"
-}
-
 #' Extract weights from lassosum_rss with shrinkage grid search
 #'
 #' Searches over a grid of shrinkage parameters \code{s} (default:
 #' \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS).
 #' For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then
 #' \code{lassosum_rss()} is called across the lambda path. Candidate selection
-#' depends on the available source:
-#' \itemize{
-#'   \item PLINK1 \code{.bed/.bim/.fam}: published \code{lassosum} pseudovalidation
-#'   \item PLINK2 sketch \code{.pgen/.pvar/.psam}: sketch-genotype pseudovalidation
-#'   \item no genotype-backed source: fallback to \code{min(fbeta)} with warning
-#' }
+#' defaults to the LD-only quadratic pseudovalidation score
+#' \deqn{\frac{c^T \beta}{\sqrt{\beta^T R \beta}}}
+#' evaluated on the supplied LD matrix \code{R}. This uses the same candidate
+#' beta path as \code{lassosum_rss()}, but scores each candidate directly from
+#' summary-statistics correlation \code{c} and LD, without requiring genotype.
 #'
 #' @details
-#' OTTERS compatibility requires source-aware model selection. When a PLINK1
-#' genotype source is available, this wrapper reproduces the old
-#' genotype-backed pseudovalidation selector. For PLINK2 sketch inputs, it
-#' performs pseudovalidation on the restored \code{U} matrix using
-#' \code{U_MIN/U_MAX}. If no genotype-backed source is provided, the function
-#' warns and falls back to \code{min(fbeta)}.
+#' The original lassosum pseudovalidation can be written as an LD quadratic
+#' score after centering and standardizing the reference matrix columns by the
+#' same per-variant scale:
+#' \deqn{\mathrm{score}(\beta) = \frac{c^T \beta}{\sqrt{\beta^T R \beta}}.}
+#' This implementation therefore uses the supplied LD matrix directly for
+#' selection. \code{min(fbeta)} is retained only as an explicit debug option.
 #'
 #' @param stat A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).
 #' @param LD LD correlation matrix R (single matrix, NOT pre-shrunk).
 #' @param s Numeric vector of shrinkage parameters to search over. Default:
 #'   \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.
-#' @param selection Selection strategy. Default \code{"auto"} uses
-#'   source-aware pseudovalidation when possible and falls back to
-#'   \code{"min_fbeta"} otherwise.
-#' @param genotype_source Optional PLINK prefix for genotype-backed or
-#'   sketch-genotype selection.
-#' @param genotype_type Source type for \code{genotype_source}: \code{"auto"},
-#'   \code{"plink1"}, \code{"plink2"}, or \code{"none"}.
-#' @param region Optional region string \code{"chr:start-end"} used to filter
-#'   genotype source variants.
-#' @param variant_info Optional variant annotation aligned to \code{stat} and
-#'   \code{LD}; must identify \code{chrom}, \code{pos}, \code{A1}, \code{A2}.
+#' @param selection Selection strategy. Default \code{"ld_quadratic"} uses
+#'   \eqn{c^T \beta / \sqrt{\beta^T R \beta}} on the supplied LD matrix.
+#'   \code{"min_fbeta"} is retained as an explicit alternative for debugging.
 #' @param ... Additional arguments passed to \code{lassosum_rss()}.
 #'
 #' @return A numeric vector of the posterior SNP coefficients at the best (s, lambda).
 #' @export
 lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0),
-                                 selection = c("auto", "min_fbeta"),
-                                 genotype_source = NULL,
-                                 genotype_type = c("auto", "plink1", "plink2", "none"),
-                                 region = NULL,
-                                 variant_info = NULL,
+                                 selection = c("ld_quadratic", "min_fbeta"),
                                  ...) {
   selection <- match.arg(selection)
-  genotype_type <- match.arg(genotype_type)
   n <- median(stat$n)
   p <- nrow(LD)
   cor_input <- .lassosum_clamp_cor(.lassosum_cor_from_stat(stat, n = n, p = p))
@@ -1161,56 +973,10 @@ lassosum_rss_weights <- function(stat, LD, s = c(0.2, 0.5, 0.9, 1.0),
   }
   candidate_meta <- do.call(rbind, candidate_meta)
 
-  mode <- .lassosum_resolve_selection_mode(selection, genotype_source, genotype_type)
-  variant_info_prepped <- .lassosum_prepare_variant_info(variant_info, expected_n = p)
-  selector_result <- NULL
-
-  if (mode == "plink1" || mode == "plink2") {
-    selector_result <- tryCatch(
-      {
-        if (is.null(variant_info_prepped)) {
-          stop("variant_info is required for genotype-backed lassosum selection.")
-        }
-        if (is.null(genotype_source) || identical(genotype_source, "")) {
-          stop("genotype_source is required for genotype-backed lassosum selection.")
-        }
-        if (mode == "plink1") {
-          .lassosum_score_candidates_with_bfile(
-            genotype_source = genotype_source,
-            region = region,
-            variant_info = variant_info_prepped,
-            candidate_beta = candidate_beta,
-            cor_input = cor_input
-          )
-        } else {
-          .lassosum_score_candidates_with_sketch(
-            genotype_source = genotype_source,
-            region = region,
-            variant_info = variant_info_prepped,
-            candidate_beta = candidate_beta,
-            cor_input = cor_input
-          )
-        }
-      },
-      error = function(e) {
-        warning(
-          "lassosum_rss_weights could not use source-aware pseudovalidation (",
-          conditionMessage(e),
-          "); falling back to min(fbeta), which is not old-OTTERS-compatible."
-        )
-        NULL
-      }
-    )
-  }
-
-  if (is.null(selector_result)) {
-    if (mode == "min_fbeta" && selection == "auto") {
-      warning(
-        "lassosum_rss_weights has no genotype-backed source for pseudovalidation; ",
-        "falling back to min(fbeta), which is not old-OTTERS-compatible."
-      )
-    }
-    selector_result <- .lassosum_select_min_fbeta(candidate_beta, candidate_meta)
+  selector_result <- if (selection == "ld_quadratic") {
+    .lassosum_select_ld_quadratic(candidate_beta, cor_input, LD)
+  } else {
+    .lassosum_select_min_fbeta(candidate_beta, candidate_meta)
   }
 
   best_beta <- as.numeric(selector_result$beta)
diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md
index b2df44a0..c0028688 100644
--- a/inst/notes/otters_lassosum_selector_fix_pr.md
+++ b/inst/notes/otters_lassosum_selector_fix_pr.md
@@ -1,69 +1,56 @@
-# OTTERS Lassosum Selector Fix
+# OTTERS Lassosum LD-Quadratic Selector Fix
 
 ## Summary
 
-This change fixes the immediate OTTERS lassosum regression, but the updated diagnostics also sharpen the longer-term conclusion: genotype is not fundamentally required for lassosum selection once the selector is written in LD form.
+This PR fixes the OTTERS lassosum regression by removing the old `min(fbeta)`
+selector from the default OTTERS path and replacing it with the LD-quadratic
+selector
 
-- The regression was caused by replacing old pseudovalidation-based model selection with `min(fbeta)`.
-- The compatibility patch in this PR restores old behavior conservatively:
-  - PLINK1 `bed/bim/fam` inputs use old genotype-backed `lassosum` pseudovalidation.
-  - PLINK2 sketch `pgen/pvar/psam` inputs use sketch-side pseudovalidation on restored `U`.
-  - inputs with no genotype-backed source still fall back to `min(fbeta)`, with an explicit warning that the fallback is not old-OTTERS-compatible.
-- The OTTERS wrapper also stops double-scaling lassosum input before it reaches the low-level solver.
-- New diagnostics after the patch show that the selector itself can be expressed as an LD quadratic form:
-  `score(beta) = (c^T beta) / sqrt(beta^T R beta)`.
-  On PLINK1-derived LD, that LD-only score reproduces old genotype-backed pseudovalidation almost exactly.
-
-## Example 206
-
-Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the regression.
-
-- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants
-- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants
-
-Old published lassosum selected `s=0.2, lambda=1e-4`. Corrected-scaling plus `min(fbeta)` still selected `s=1, lambda=1e-4`.
+```text
+score(beta) = (c^T beta) / sqrt(beta^T R beta)
+```
 
-Implication:
+where:
 
-- this is not a grid-definition problem
-- both candidates are already on the old grid
-- the regression comes from changing the selection rule over the same grid
+- `c` is the aligned summary-statistics correlation vector
+- `R` is the supplied LD correlation matrix
+- `beta` is one candidate on the lassosum `(s, lambda)` path
 
-For this fixture, old pseudovalidation is a degenerate tie case, so old OTTERS selects the first maximum. Replacing that with `min(fbeta)` moves the selected model to the opposite end of the same grid.
+The important conclusion from the follow-up diagnostics is that genotype is not
+fundamentally required for lassosum selection once the selector is written in
+this LD form. The remaining sketch issue is about how `R` is constructed or
+standardized, not about the selector formula itself.
 
-## Example 161
+## What Failed
 
-Fixture `chr1_161023868_163023868__ENSG00000162745` is the informative selector case.
+Old OTTERS did not select lassosum models by `min(fbeta)`. It fit the beta path
+and then used lassosum pseudovalidation to choose the final `(s, lambda)`.
 
-On the common matched set of `4298` variants:
+The refactor changed that selector to `min(fbeta)`, and the OTTERS wrapper also
+double-scaled the lassosum input before it reached the low-level solver.
 
-- exact PLINK1 bfile pseudovalidation best candidate: `soft_lambda=0.041050213`
-- sketch-genotype pseudovalidation best candidate: `soft_lambda=0.021788613`
-- score agreement remains high:
-  - Pearson `0.9971`
-  - Spearman `0.9412`
-  - max absolute difference `0.0474`
+Those two changes were enough to move the selected model to a very different
+part of the same grid.
 
-When the sketch path is forced to use `sd_bfile`, the winning candidate returns to `soft_lambda=0.041050213`.
+## Example 206
 
-Implication:
+Fixture `chr1_206088859_208088859__ENSG00000123843` isolates the selector bug.
 
-- this is not a variant-overlap problem
-- this is a predictive normalization problem inside pseudovalidation
-- restored sketch `U` recovers the score surface closely
-- sketch-derived `sd` is not identical to old `lassosum:::sd.bfile()`
+- old saved vs old direct published `lassosum`: Pearson `1.0`, `0` opposite-sign variants
+- corrected-scaling + `min(fbeta)`: Pearson about `0.360`, `1309` opposite-sign variants
 
-So PLINK1 is the exact old-parity oracle in this PR, but it is no longer the conceptual end-state. The real conclusion is:
+On this fixture:
 
-- genotype is not fundamentally needed for lassosum selection
-- the LD quadratic selector is the mathematically correct formulation
-- the remaining mismatch is specific to how the sketch-side `R` / scaling is constructed, not to the selector formula itself
+- published lassosum selected `s = 0.2`, `lambda = 1e-4`
+- `min(fbeta)` selected `s = 1`, `lambda = 1e-4`
 
-## LD-Only Follow-up
+This is not a grid-definition problem. Both candidates are already on the old
+grid. The regression comes from changing the selection rule over the same
+candidate path.
 
-After the compatibility patch, we ran a direct genotype-vs-LD pseudovalidation diagnostic using the same candidate matrix and the same aligned `cor` vector.
+## Mathematical Rationale
 
-For each candidate `beta_k`, old pseudovalidation computes:
+Old pseudovalidation can be written as:
 
 ```text
 scaled_beta = beta / sd
@@ -71,119 +58,89 @@ pred        = X * scaled_beta
 score       = (c^T beta) / sqrt(Var(pred))
 ```
 
-After centering and standardizing the matrix columns by the same `sd`, this becomes:
+After centering and standardizing the columns of `X` by the same per-variant
+scale, this becomes:
 
 ```text
 score(beta) = (c^T beta) / sqrt(beta^T R beta)
 ```
 
-where `R` is the standardized LD correlation matrix.
+So the selector can be evaluated directly from summary-statistics correlation
+and LD, without using genotype explicitly. That is the selector implemented in
+this PR.
+
+## Validation
+
+We ran the LD-quadratic score directly against genotype-backed pseudovalidation
+on the same candidate matrix.
 
 ### PLINK1-derived LD
 
-The LD quadratic score matches genotype-backed pseudovalidation essentially exactly.
+The LD-quadratic score matches genotype pseudovalidation essentially exactly.
 
 - Fixture `161`:
-  - genotype best: `soft_lambda=0.041050213`, score `0.3922803981`
-  - LD-only PLINK1 best: `soft_lambda=0.041050213`, score `0.3922601179`
-  - Pearson `0.9999999`, same best candidate `TRUE`
+  - genotype best: `soft_lambda=0.041050213`
+  - LD-quadratic PLINK1 best: `soft_lambda=0.041050213`
+  - Pearson `0.9999999`
+  - same best candidate `TRUE`
 - Fixture `206`:
-  - genotype best: `soft_lambda=0.029906976`, score `0.3895932657`
-  - LD-only PLINK1 best: `soft_lambda=0.029906976`, score `0.3897370312`
-  - Pearson `1.0000000`, same best candidate `TRUE`
+  - genotype best: `soft_lambda=0.029906976`
+  - LD-quadratic PLINK1 best: `soft_lambda=0.029906976`
+  - Pearson `1.0000000`
+  - same best candidate `TRUE`
 
-This validates the LD-only quadratic selector. Genotype was only one way the old code evaluated the score; it is not inherently required once the standardized `R` is available.
+This validates the selector formula itself.
 
 ### Sketch-derived LD
 
-The same LD-only formulation,
-`score(beta) = (c^T beta) / sqrt(beta^T R_sketch beta)`,
-does **not** yet reproduce the genotype selector on the informative `161` fixture.
+The same LD-quadratic selector applied to the current sketch-derived `R` gives:
 
 - Fixture `161`:
-  - genotype best: `soft_lambda=0.041050213`
-  - LD-only sketch best: `soft_lambda=0.021788613`
-  - Pearson `0.9971044`, same best candidate `FALSE`
+  - sketch-LD quadratic best: `soft_lambda=0.021788613`
+  - Pearson vs genotype pseudovalidation `0.9971044`
+  - same best candidate `FALSE`
 - Fixture `206`:
-  - genotype best: `soft_lambda=0.029906976`
-  - LD-only sketch best: `soft_lambda=0.029906976`
-  - Pearson `0.9999503`, same best candidate `TRUE`
+  - sketch-LD quadratic best: `soft_lambda=0.029906976`
+  - Pearson vs genotype pseudovalidation `0.9999503`
+  - same best candidate `TRUE`
 
-So the unresolved problem is no longer the selector formula. It is the sketch-side construction of a standardized `R` that matches the PLINK1/genotype-backed path.
+That result is consistent with the sketch sample-matrix diagnostic: the same
+winner shift appears there too. So the remaining mismatch is not between
+"sample-matrix pseudovalidation" and "quadratic LD scoring". It is between the
+current sketch representation and the PLINK1/genotype-backed standardized LD
+path.
 
-## What Is Fixed
+## What This PR Changes
 
 ## `R/regularized_regression.R`
 
-- `lassosum_rss_weights()` no longer defaults every OTTERS run to `min(fbeta)`.
-- It now selects by source:
-  - PLINK1: published `lassosum` pseudovalidation
-  - PLINK2 sketch: restored-`U` pseudovalidation
-  - no genotype-backed source: warning + `min(fbeta)` fallback
-- It preserves old first-max tie behavior for pseudovalidation selectors.
-- It passes lassosum input in correlation units only once before the low-level solver converts to `z / sqrt(n)`.
-
-What actually failed:
-
-- old OTTERS-compatible selection was replaced with `min(fbeta)`
-- the OTTERS wrapper also double-scaled lassosum input
-
-Why it worked earlier:
-
-- old OTTERS used published `lassosum.pipeline(..., test.bfile = bfile)` followed by `pseudovalidate(out)`
-
-Why it failed here:
-
-- the refactor changed the selector and the scaling contract
-- fixture `206` exposed the selector regression clearly because it is a tied pseudovalidation table where first-max and `min(fbeta)` pick different candidates
-
-Classification:
-
-- selector replacement: real behavioral regression
-- double scaling: real implementation bug
-- fixture `206` tie case: edge-case trigger that exposed the regression cleanly
-
-Compatibility:
-
-- exact old behavior is restored for PLINK1 genotype-backed OTTERS runs
-- PLINK2 sketch inputs now use a source-aware selector instead of the generic fallback
-- pure precomputed-LD inputs remain on the fallback path and now say so explicitly
-
-Important clarification from the new diagnostics:
-
-- this PR should not be read as claiming genotype is necessary for lassosum
-- it should be read as a compatibility patch that restores the old selector while the cleaner LD-only selector is validated
-- the LD-only selector is now validated on PLINK1-derived LD; the remaining work is to make the sketch-derived LD path produce the same standardized `R`
+- fixes the OTTERS lassosum scaling contract so correlation input is only
+  converted once before the low-level solver
+- removes genotype-format-specific selector dispatch from
+  `lassosum_rss_weights()`
+- makes the default selector `ld_quadratic`
+- keeps `min(fbeta)` only as an explicit debug option
+- preserves first-max tie behavior for equal selector scores
 
 ## `R/otters.R`
 
-- `otters_weights()` now forwards aligned `chrom/pos/A1/A2` variant metadata to lassosum when available.
-- This restores the variant-alignment information needed for source-aware selection.
-
-Classification:
-
-- wrapper or interface drift
+- passes correlation-scale statistics into lassosum explicitly via
+  `stat$cor` / `stat$z`
+- removes the temporary genotype-source / variant-metadata plumbing that was
+  only needed for the earlier compatibility patch
 
-## Tests
+## Scope
 
-Focused validation for this change:
+This PR only changes the OTTERS lassosum selection path.
 
-- `tests/testthat/test_rr_lassosum.R`
-- `tests/testthat/test_rr_dispatch.R`
-- `tests/testthat/test_otters.R`
+- It does not change PRS-CS or SDPR behavior.
+- It does not claim the current sketch-derived LD is already identical to the
+  PLINK1/genotype-backed path.
+- It does claim that the selector should be implemented in LD form, and that
+  `min(fbeta)` was the wrong default for OTTERS parity.
 
-Local evidence paths used during debugging:
+## Evidence Paths
 
 - `temp_reference/otters_regression/lassosum_oldR_direct_206/`
 - `temp_reference/otters_regression/lassosum_forensics_206/`
 - `temp_reference/otters_regression/pseudovalidation_sketch_vs_bfile/`
-
-## Narrow Compatibility Note
-
-This change only alters the OTTERS lassosum selection path.
-
-- It restores exact old behavior when a legacy PLINK1 genotype source is available.
-- It makes PLINK2 sketch inputs use a sketch-side selector instead of the old generic fallback.
-- It does not change PRS-CS or SDPR behavior.
-
-The new diagnostics imply that a future cleanup should replace the current genotype/sketch split with one format-generic LD-only selector once the sketch-side `R` construction is brought into line with the PLINK1 path.
diff --git a/tests/testthat/test_otters.R b/tests/testthat/test_otters.R
index d31ce2fd..b8e8de3e 100644
--- a/tests/testthat/test_otters.R
+++ b/tests/testthat/test_otters.R
@@ -9,7 +9,7 @@ test_that("otters_weights returns named list of weight vectors", {
   R <- diag(p)
   sumstats <- data.frame(z = z)
   result <- otters_weights(sumstats, R, n,
-    methods = list(lassosum_rss = list(selection = "min_fbeta")),
+    methods = list(lassosum_rss = list()),
     p_thresholds = c(0.05)
   )
   expect_type(result, "list")
@@ -26,7 +26,7 @@ test_that("otters_weights computes z from beta/se if z missing", {
   sumstats <- data.frame(beta = rnorm(p, sd = 0.1), se = rep(0.05, p))
   R <- diag(p)
   result <- otters_weights(sumstats, R, n,
-    methods = list(lassosum_rss = list(selection = "min_fbeta")),
+    methods = list(lassosum_rss = list()),
     p_thresholds = c(0.05)
   )
   expect_true("lassosum_rss" %in% names(result))
@@ -75,7 +75,7 @@ test_that("otters_weights with multiple methods returns all", {
   sumstats <- data.frame(z = z)
   result <- otters_weights(sumstats, R, n,
     methods = list(
-      lassosum_rss = list(selection = "min_fbeta"),
+      lassosum_rss = list(),
       prs_cs = list(n_iter = 50, n_burnin = 10, thin = 2, seed = 42)
     ),
     p_thresholds = c(0.001, 0.05)
@@ -87,23 +87,16 @@ test_that("otters_weights with multiple methods returns all", {
   }
 })
 
-test_that("otters_weights forwards variant_info only to lassosum", {
+test_that("otters_weights passes correlation-scale stat fields to lassosum", {
   p <- 5
   n <- 100
   z <- rnorm(p)
   R <- diag(p)
-  sumstats <- data.frame(
-    chrom = rep(1, p),
-    pos = seq_len(p),
-    A1 = c("A", "C", "G", "T", "A"),
-    A2 = c("G", "T", "A", "C", "G"),
-    z = z,
-    stringsAsFactors = FALSE
-  )
+  sumstats <- data.frame(z = z)
   captured <- new.env(parent = emptyenv())
   local_mocked_bindings(
-    lassosum_rss_weights = function(stat, LD, variant_info = NULL, ...) {
-      captured$lassosum_variant_info <- variant_info
+    lassosum_rss_weights = function(stat, LD, ...) {
+      captured$lassosum_stat <- stat
       captured$lassosum_dots <- list(...)
       rep(0.1, nrow(LD))
     },
@@ -116,7 +109,7 @@ test_that("otters_weights forwards variant_info only to lassosum", {
   result <- otters_weights(
     sumstats, R, n,
     methods = list(
-      lassosum_rss = list(selection = "min_fbeta"),
+      lassosum_rss = list(),
       prs_cs = list(phi = 1e-4)
     ),
     p_thresholds = NULL,
@@ -125,8 +118,9 @@ test_that("otters_weights forwards variant_info only to lassosum", {
 
   expect_equal(result$lassosum_rss, rep(0.1, p))
   expect_equal(result$prs_cs, rep(0.2, p))
-  expect_equal(captured$lassosum_variant_info, sumstats[, c("chrom", "pos", "A1", "A2")])
-  expect_null(captured$prs_cs_dots$variant_info)
+  expect_equal(captured$lassosum_stat$cor, z / sqrt(n))
+  expect_equal(captured$lassosum_stat$z, z)
+  expect_equal(captured$lassosum_stat$b, z / sqrt(n))
 })
 
 # ---- otters_association ----
@@ -201,7 +195,7 @@ test_that("otters_weights + otters_association end-to-end on simulated data", {
   # Stage I: train weights
   sumstats <- data.frame(z = eqtl_z)
   weights <- otters_weights(sumstats, R, n_eqtl,
-    methods = list(lassosum_rss = list(selection = "min_fbeta")),
+    methods = list(lassosum_rss = list()),
     p_thresholds = c(0.05)
   )
   expect_true(length(weights) >= 2)
diff --git a/tests/testthat/test_rr_dispatch.R b/tests/testthat/test_rr_dispatch.R
index a17e44e9..2b2a4223 100644
--- a/tests/testthat/test_rr_dispatch.R
+++ b/tests/testthat/test_rr_dispatch.R
@@ -90,106 +90,66 @@ test_that("lassosum_rss_weights dispatches to lassosum_rss once per s value", {
   expect_false(identical(call_log$calls[[1]]$LD, call_log$calls[[2]]$LD))
 })
 
-test_that("lassosum_rss_weights auto mode dispatches to PLINK1 pseudovalidation", {
+test_that("lassosum_rss_weights defaults to LD-quadratic selection", {
   p <- 4
-  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+  stat <- list(cor = c(0.4, 0.1, 0, 0), n = rep(100, p))
   R <- diag(p)
-  variant_info <- data.frame(
-    chrom = 1,
-    pos = 1:p,
-    A1 = c("A", "C", "G", "T"),
-    A2 = c("G", "T", "A", "C"),
-    stringsAsFactors = FALSE
-  )
+  R[1, 2] <- 0.5
+  R[2, 1] <- 0.5
+  expected <- c(1, 0, 0, 0)
 
   local_mocked_bindings(
-    has_plink1_files = function(prefix) TRUE,
-    has_plink2_files = function(prefix) FALSE,
     lassosum_rss = function(bhat, LD, n, ...) {
       list(
-        beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)),
+        beta = cbind(expected, c(1, 1, 0, 0)),
         lambda = c(0.05, 0.01),
-        fbeta = c(2, 1)
+        fbeta = c(5, 1)
       )
-    },
-    .lassosum_score_candidates_with_bfile = function(...) {
-      list(beta = rep(0.7, p), index = 2L, mode = "plink1_pseudovalidation")
     }
   )
 
-  result <- lassosum_rss_weights(
-    stat = stat,
-    LD = R,
-    s = 0.5,
-    selection = "auto",
-    genotype_source = "/fake/plink1",
-    variant_info = variant_info
-  )
-
-  expect_equal(c(result), rep(0.7, p))
-  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink1_pseudovalidation")
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5)
+  expect_equal(c(result), expected)
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic")
 })
 
-test_that("lassosum_rss_weights auto mode dispatches to PLINK2 sketch pseudovalidation", {
-  p <- 4
-  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+test_that("lassosum_rss_weights uses first-max tie behavior for LD-quadratic selection", {
+  p <- 3
+  stat <- list(cor = c(0.3, 0.3, 0), n = rep(100, p))
   R <- diag(p)
-  variant_info <- data.frame(
-    chrom = 1,
-    pos = 1:p,
-    A1 = c("A", "C", "G", "T"),
-    A2 = c("G", "T", "A", "C"),
-    stringsAsFactors = FALSE
-  )
 
   local_mocked_bindings(
-    has_plink1_files = function(prefix) FALSE,
-    has_plink2_files = function(prefix) TRUE,
     lassosum_rss = function(bhat, LD, n, ...) {
       list(
-        beta = matrix(seq_len(length(bhat) * 2), nrow = length(bhat)),
+        beta = cbind(c(1, 0, 0), c(0, 1, 0)),
         lambda = c(0.05, 0.01),
         fbeta = c(2, 1)
       )
-    },
-    .lassosum_score_candidates_with_sketch = function(...) {
-      list(beta = rep(0.4, p), index = 1L, mode = "plink2_sketch_pseudovalidation")
     }
   )
 
-  result <- lassosum_rss_weights(
-    stat = stat,
-    LD = R,
-    s = 0.5,
-    selection = "auto",
-    genotype_source = "/fake/plink2",
-    variant_info = variant_info
-  )
-
-  expect_equal(c(result), rep(0.4, p))
-  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "plink2_sketch_pseudovalidation")
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5)
+  expect_equal(c(result), c(1, 0, 0))
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic")
 })
 
-test_that("lassosum_rss_weights warns and falls back to min(fbeta) without genotype source", {
+test_that("lassosum_rss_weights still supports explicit min(fbeta)", {
   p <- 4
-  stat <- list(b = c(0.1, -0.2, 0.05, 0.03), n = rep(100, p))
+  stat <- list(cor = c(0.4, 0.1, 0, 0), n = rep(100, p))
   R <- diag(p)
-  expected <- rep(0.2, p)
+  expected <- c(1, 1, 0, 0)
 
   local_mocked_bindings(
     lassosum_rss = function(bhat, LD, n, ...) {
       list(
-        beta = cbind(rep(0, length(bhat)), expected),
+        beta = cbind(c(1, 0, 0, 0), expected),
         lambda = c(0.05, 0.01),
         fbeta = c(5, 1)
       )
     }
   )
 
-  result <- expect_warning(
-    lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "auto"),
-    "not old-OTTERS-compatible"
-  )
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta")
   expect_equal(c(result), expected)
   expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta")
 })
diff --git a/tests/testthat/test_rr_lassosum.R b/tests/testthat/test_rr_lassosum.R
index fda14943..27f74f5b 100644
--- a/tests/testthat/test_rr_lassosum.R
+++ b/tests/testthat/test_rr_lassosum.R
@@ -86,11 +86,11 @@ test_that("lassosum_rss_weights calls lassosum_rss and returns beta_est", {
       )
     }
   )
-  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5, selection = "min_fbeta")
+  result <- lassosum_rss_weights(stat = stat, LD = R, s = 0.5)
   expect_equal(length(result), p)
   expect_true(is.numeric(result))
   expect_equal(c(result), expected)
-  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "min_fbeta")
+  expect_equal(unname(attr(result, "lassosum_selection")["mode"]), "ld_quadratic")
 })
 
 test_that("lassosum_rss_weights clamps correlation input before scaling", {

From e01db0450051b42a0bd98a84faf5fab809dd1705 Mon Sep 17 00:00:00 2001
From: Hao Sun <sunh14@lc06e01.chimera.hpc.mssm.edu>
Date: Fri, 24 Apr 2026 00:00:49 -0400
Subject: [PATCH 4/5] Clarify source-matched lassosum validation note

---
 inst/notes/otters_lassosum_selector_fix_pr.md | 41 +++++++++----------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/inst/notes/otters_lassosum_selector_fix_pr.md b/inst/notes/otters_lassosum_selector_fix_pr.md
index c0028688..5b6bb209 100644
--- a/inst/notes/otters_lassosum_selector_fix_pr.md
+++ b/inst/notes/otters_lassosum_selector_fix_pr.md
@@ -71,44 +71,43 @@ this PR.
 
 ## Validation
 
-We ran the LD-quadratic score directly against genotype-backed pseudovalidation
-on the same candidate matrix.
+We validated the LD-quadratic score against the corresponding source-matched
+sample-matrix pseudovalidation on the same candidate matrix.
 
-### PLINK1-derived LD
+### PLINK1 source: genotype matrix vs LD-quadratic
 
-The LD-quadratic score matches genotype pseudovalidation essentially exactly.
+The LD-quadratic score from PLINK1-derived LD matches PLINK1 genotype
+pseudovalidation essentially exactly.
 
 - Fixture `161`:
-  - genotype best: `soft_lambda=0.041050213`
-  - LD-quadratic PLINK1 best: `soft_lambda=0.041050213`
+  - PLINK1 genotype best: `soft_lambda=0.041050213`
+  - PLINK1 LD-quadratic best: `soft_lambda=0.041050213`
   - Pearson `0.9999999`
   - same best candidate `TRUE`
 - Fixture `206`:
-  - genotype best: `soft_lambda=0.029906976`
-  - LD-quadratic PLINK1 best: `soft_lambda=0.029906976`
+  - PLINK1 genotype best: `soft_lambda=0.029906976`
+  - PLINK1 LD-quadratic best: `soft_lambda=0.029906976`
   - Pearson `1.0000000`
   - same best candidate `TRUE`
 
 This validates the selector formula itself.
 
-### Sketch-derived LD
+### Sketch source: sample matrix vs LD-quadratic
 
-The same LD-quadratic selector applied to the current sketch-derived `R` gives:
+For the sketch source, the sample-matrix pseudovalidation and the LD-quadratic
+score are the same numeric object once both are built from the same restored
+sketch matrix and the same column standardization.
 
 - Fixture `161`:
-  - sketch-LD quadratic best: `soft_lambda=0.021788613`
-  - Pearson vs genotype pseudovalidation `0.9971044`
-  - same best candidate `FALSE`
-- Fixture `206`:
-  - sketch-LD quadratic best: `soft_lambda=0.029906976`
-  - Pearson vs genotype pseudovalidation `0.9999503`
+  - sketch sample-matrix best: `soft_lambda=0.021788613`
+  - sketch LD-quadratic best: `soft_lambda=0.021788613`
+  - Pearson `1.0`
+  - max absolute difference `< 1e-15`
   - same best candidate `TRUE`
 
-That result is consistent with the sketch sample-matrix diagnostic: the same
-winner shift appears there too. So the remaining mismatch is not between
-"sample-matrix pseudovalidation" and "quadratic LD scoring". It is between the
-current sketch representation and the PLINK1/genotype-backed standardized LD
-path.
+So the remaining mismatch is not between "sample-matrix pseudovalidation" and
+"quadratic LD scoring". It is between the current sketch representation and the
+PLINK1/genotype-backed standardized LD path.
 
 ## What This PR Changes
 

From 9e93ee9a50439c066e5095245f54149c7a22a882 Mon Sep 17 00:00:00 2001
From: danielnachun <danielnachun@users.noreply.github.com>
Date: Fri, 24 Apr 2026 20:55:01 +0000
Subject: [PATCH 5/5] Update documentation

---
 man/lassosum_rss_weights.Rd | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/man/lassosum_rss_weights.Rd b/man/lassosum_rss_weights.Rd
index 9bb71070..2d1cc2a3 100644
--- a/man/lassosum_rss_weights.Rd
+++ b/man/lassosum_rss_weights.Rd
@@ -4,7 +4,13 @@
 \alias{lassosum_rss_weights}
 \title{Extract weights from lassosum_rss with shrinkage grid search}
 \usage{
-lassosum_rss_weights(stat, LD, s = c(0.2, 0.5, 0.9, 1), ...)
+lassosum_rss_weights(
+  stat,
+  LD,
+  s = c(0.2, 0.5, 0.9, 1),
+  selection = c("ld_quadratic", "min_fbeta"),
+  ...
+)
 }
 \arguments{
 \item{stat}{A list with \code{$b} (effect sizes) and \code{$n} (per-variant sample sizes).}
@@ -14,6 +20,10 @@ lassosum_rss_weights(stat, LD, s = c(0.2, 0.5, 0.9, 1), ...)
 \item{s}{Numeric vector of shrinkage parameters to search over. Default:
 \code{c(0.2, 0.5, 0.9, 1.0)} following Mak et al (2017) and OTTERS.}
 
+\item{selection}{Selection strategy. Default \code{"ld_quadratic"} uses
+\eqn{c^T \beta / \sqrt{\beta^T R \beta}} on the supplied LD matrix.
+\code{"min_fbeta"} is retained as an explicit alternative for debugging.}
+
 \item{...}{Additional arguments passed to \code{lassosum_rss()}.}
 }
 \value{
@@ -23,14 +33,18 @@ A numeric vector of the posterior SNP coefficients at the best (s, lambda).
 Searches over a grid of shrinkage parameters \code{s} (default:
 \code{c(0.2, 0.5, 0.9, 1.0)}, matching the original lassosum and OTTERS).
 For each \code{s}, the LD matrix is shrunk as \code{(1-s)*R + s*I}, then
-\code{lassosum_rss()} is called across the lambda path. The best
-\code{(s, lambda)} combination is selected by the lowest objective value.
+\code{lassosum_rss()} is called across the lambda path. Candidate selection
+defaults to the LD-only quadratic pseudovalidation score
+\deqn{\frac{c^T \beta}{\sqrt{\beta^T R \beta}}}
+evaluated on the supplied LD matrix \code{R}. This uses the same candidate
+beta path as \code{lassosum_rss()}, but scores each candidate directly from
+summary-statistics correlation \code{c} and LD, without requiring genotype.
 }
 \details{
-Model selection uses \code{min(fbeta)} (penalized objective) rather than
-the pseudovalidation approach from the original lassosum R package. Empirical
-comparison over 20 random trials (n=300, p=50, 3 causal) shows no systematic
-advantage for either method: pseudovalidation won 4/20, min(fbeta) won 6/20,
-tied 10/20. The shrinkage grid over \code{s} provides the primary regularization;
-lambda selection within each \code{s} has minimal impact.
+The original lassosum pseudovalidation can be written as an LD quadratic
+score after centering and standardizing the reference matrix columns by the
+same per-variant scale:
+\deqn{\mathrm{score}(\beta) = \frac{c^T \beta}{\sqrt{\beta^T R \beta}}.}
+This implementation therefore uses the supplied LD matrix directly for
+selection. \code{min(fbeta)} is retained only as an explicit debug option.
 }