diff --git a/src/commands/meta_staar.rs b/src/commands/meta_staar.rs index ad4276a..5e89404 100644 --- a/src/commands/meta_staar.rs +++ b/src/commands/meta_staar.rs @@ -44,14 +44,6 @@ pub fn build_config( loci.display() ))); } - if matches!(conditional_model, crate::cli::ConditionalModel::Heterogeneous) { - return Err(CohortError::Input( - "--conditional-model heterogeneous requires per-study U vectors \ - which --emit-sumstats does not yet persist. Use --conditional-model \ - homogeneous for now." - .into(), - )); - } } let mask_categories = crate::commands::parse_mask_categories(&masks)?; diff --git a/src/staar/carrier/sparse_score.rs b/src/staar/carrier/sparse_score.rs index 5d27520..9681474 100644 --- a/src/staar/carrier/sparse_score.rs +++ b/src/staar/carrier/sparse_score.rs @@ -405,6 +405,7 @@ pub(crate) fn null_model_from_analysis(analysis: &AnalysisVectors) -> NullModel // a dense G and a non-mixed-model NullModel. Kinship-aware analyses // route through `score_gene_sparse_kinship` directly. 
kinship: None, + scang: None, } } diff --git a/src/staar/meta.rs b/src/staar/meta.rs index c187c47..0d9b499 100644 --- a/src/staar/meta.rs +++ b/src/staar/meta.rs @@ -406,7 +406,10 @@ pub fn merge_chromosome( bool_or({ccre_p}) AS {ccre_p}, \ bool_or({ccre_e}) AS {ccre_e}, \ {weight_aggs}, \ - CAST(array_agg(named_struct('s', study_idx, 'seg', segment_id)) AS VARCHAR) AS study_segs \ + CAST(array_agg(named_struct('s', study_idx, 'seg', segment_id)) AS VARCHAR) AS study_segs, \ + CAST(array_agg(named_struct('s', study_idx, \ + 'u', CASE WHEN {ref_a} <= {alt_a} THEN u_stat ELSE -u_stat END)) \ + AS VARCHAR) AS study_us \ FROM _study_variants \ WHERE {maf} < {maf_cutoff} \ GROUP BY {pos}, \ @@ -428,7 +431,7 @@ pub fn merge_chromosome( cadd_phred_raw, {revel}, {msp}, {gh}, \ {cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \ {weight_select}, \ - study_segs \ + study_segs, study_us \ FROM _meta_variants ORDER BY {pos}", pos = Col::Position, ref_a = Col::RefAllele, @@ -512,6 +515,7 @@ pub fn merge_chromosome( w_arrs.push(f64_col(17 + i)?); } let segs_arr = str_col(28)?; + let study_u_arr = str_col(29)?; for i in 0..n { let mut weights = [0.0f64; 11]; @@ -521,6 +525,7 @@ pub fn merge_chromosome( let cadd_phred = f64_or(cadd_raw_arr, i, 0.0); let study_segments = parse_study_segments(str_or(segs_arr, i, "")); + let u_study = parse_study_us(str_or(study_u_arr, i, "")); let mac_total = i64_or(mac_arr, i, 0); let n_total = i64_or(n_obs_arr, i, 0); @@ -559,6 +564,7 @@ pub fn merge_chromosome( mac_total, n_total, study_segments, + u_study, }); } } @@ -671,6 +677,7 @@ pub fn meta_score_gene( n_variants: m as u32, cumulative_mac: cmac as u32, staar: sr, + emthr: f64::NAN, }, burden_beta, burden_se, @@ -750,6 +757,34 @@ fn parse_study_segments(s: &str) -> Vec<(usize, i32)> { result } +/// Parse DuckDB's stringified `array_agg(named_struct('s',..,'u',..))` into +/// `(study_idx, signed_u)` pairs. 
Mirrors `parse_study_segments` byte-for-byte +/// save for the 'u' value being a float rather than an integer segment id. +fn parse_study_us(s: &str) -> Vec<(usize, f64)> { + let mut result = Vec::new(); + for part in s.split('{') { + let part = + part.trim_matches(|c: char| c == '[' || c == ']' || c == ',' || c == ' ' || c == '}'); + if part.is_empty() { + continue; + } + let mut study: Option<usize> = None; + let mut u: Option<f64> = None; + for kv in part.split(',') { + let kv = kv.trim(); + if let Some(val) = kv.strip_prefix("s:").or_else(|| kv.strip_prefix("s :")) { + study = val.trim().parse().ok(); + } else if let Some(val) = kv.strip_prefix("u:").or_else(|| kv.strip_prefix("u :")) { + u = val.trim().parse().ok(); + } + } + if let (Some(s), Some(u)) = (study, u) { + result.push((s, u)); + } + } + result +} + const SEGMENT_BP: u32 = 500_000; const MAX_SEGMENT_VARIANTS: usize = 2000; @@ -1215,27 +1250,23 @@ pub fn parse_known_loci_file( } /// Conditional meta-analysis: condition gene-level U/K on known loci -/// before running STAAR tests. -/// -/// Homogeneous model: condition the merged (cross-study) U and K. -/// Heterogeneous model: condition per-study U and K before merging. -/// -/// The conditioning step uses Schur complement: -/// U_cond = U_t - K_tc * K_cc^{-1} * U_c -/// K_cond = K_tt - K_tc * K_cc^{-1} * K_ct -/// +/// before running STAAR tests. Schur complement: +/// U_cond = U_t - K_tc · K_cc⁻¹ · U_c +/// K_cond = K_tt - K_tc · K_cc⁻¹ · K_ct /// where t = test (gene) variants, c = conditioning (known loci) variants. -/// Conditional meta-scoring uses the homogeneous model: condition the -/// merged (cross-study) U and K on known-loci variants via Schur complement. -/// The heterogeneous model (per-study conditioning) is rejected at config -/// time because --emit-sumstats does not yet persist per-study U vectors. +/// +/// Homogeneous model sums U and K across studies before the Schur solve. 
+/// Heterogeneous model does the Schur solve inside each study first and +/// sums per-study (U_cond_i, K_cond_i) at the end, matching MetaSTAAR +/// R/MetaSTAAR_merge_cond.R:391-404 and tolerating per-study variation in +/// covariate-adjusted residual variance. pub fn meta_score_gene_conditional( group: &MaskGroup, meta_variants: &[MetaVariant], studies: &[StudyHandle], segment_cache: &HashMap<(usize, i32), SegmentCov>, known_loci_indices: &[usize], - _heterogeneous: bool, + heterogeneous: bool, ) -> Option { let gene_indices: Vec = group .variant_indices @@ -1264,7 +1295,6 @@ pub fn meta_score_gene_conditional( let m_t = gene_indices.len(); let m_c = cond_indices.len(); - // Build combined keys: [gene variants | conditioning variants]. let combined: Vec = gene_indices .iter() .chain(cond_indices.iter()) @@ -1272,12 +1302,6 @@ pub fn meta_score_gene_conditional( .collect(); let m_all = combined.len(); - // Homogeneous: condition merged U/K. - let mut u_all = Mat::zeros(m_all, 1); - for (local, &gi) in combined.iter().enumerate() { - u_all[(local, 0)] = meta_variants[gi].u_meta; - } - let keys: Vec<(u32, &str, &str)> = combined .iter() .map(|&gi| { @@ -1289,6 +1313,46 @@ pub fn meta_score_gene_conditional( }) .collect(); + if heterogeneous { + // Mirrors MetaSTAAR R/MetaSTAAR_merge_cond.R:391-404. + // Each study conditions its own (U_i, K_i) via Schur complement; + // the per-study conditional pair is summed across studies. Studies + // with no overlap on a variant contribute zero U and zero K for + // that row / column, so they drop out of the Schur solve naturally. 
+ let mut u_cond_sum = Mat::zeros(m_t, 1); + let mut k_cond_sum = Mat::zeros(m_t, m_t); + + for study_idx in 0..studies.len() { + let (u_study, cov_study) = build_study_u_cov( + study_idx, + &combined, + meta_variants, + segment_cache, + &keys, + ); + + let (u_t_i, k_tt_i, u_c_i, k_cc_i, k_tc_i) = + partition_u_cov(&u_study, &cov_study, m_t, m_c); + let (u_cond_i, k_cond_i) = + schur_condition(&u_t_i, &k_tt_i, &u_c_i, &k_cc_i, &k_tc_i); + + for i in 0..m_t { + u_cond_sum[(i, 0)] += u_cond_i[(i, 0)]; + for j in 0..m_t { + k_cond_sum[(i, j)] += k_cond_i[(i, j)]; + } + } + } + + return finish_conditional(&gene_indices, meta_variants, &u_cond_sum, &k_cond_sum, group); + } + + // Homogeneous: condition merged U/K. + let mut u_all = Mat::zeros(m_all, 1); + for (local, &gi) in combined.iter().enumerate() { + u_all[(local, 0)] = meta_variants[gi].u_meta; + } + let mut cov_all = Mat::zeros(m_all, m_all); for study_idx in 0..studies.len() { let mut needed_segments: std::collections::HashSet<i32> = @@ -1318,6 +1382,50 @@ fn meta_score_gene_conditional( finish_conditional(&gene_indices, meta_variants, &u_cond, &k_cond, group) } +/// Build one study's (U, K) across the combined variant list. Variants not +/// present in the study contribute zero; segments absent from the cache +/// contribute zero (same fallback as the homogeneous accumulator). 
+fn build_study_u_cov( + study_idx: usize, + combined: &[usize], + meta_variants: &[MetaVariant], + segment_cache: &HashMap<(usize, i32), SegmentCov>, + keys: &[(u32, &str, &str)], +) -> (Mat, Mat) { + let m_all = combined.len(); + let mut u = Mat::zeros(m_all, 1); + for (local, &gi) in combined.iter().enumerate() { + for &(s, val) in &meta_variants[gi].u_study { + if s == study_idx { + u[(local, 0)] = val; + break; + } + } + } + + let mut cov = Mat::zeros(m_all, m_all); + let mut needed: std::collections::HashSet<i32> = std::collections::HashSet::new(); + for &gi in combined { + for &(s, seg_id) in &meta_variants[gi].study_segments { + if s == study_idx { + needed.insert(seg_id); + } + } + } + for seg_id in needed { + if let Some(seg) = segment_cache.get(&(study_idx, seg_id)) { + let sub = seg.extract_submatrix(keys); + for i in 0..m_all { + for j in 0..m_all { + cov[(i, j)] += sub[(i, j)]; + } + } + } + } + + (u, cov) +} + /// Partition combined U and K into test (t) and conditioning (c) blocks. 
#[allow(clippy::type_complexity)] fn partition_u_cov( @@ -1442,6 +1550,7 @@ fn finish_conditional( n_variants: m_t as u32, cumulative_mac: cmac as u32, staar: sr, + emthr: f64::NAN, }, burden_beta, burden_se, @@ -1562,6 +1671,7 @@ mod tests { mac_total: (2.0 * mafs[i] * n as f64).round() as i64, n_total: n as i64, study_segments: vec![(0, 0)], + u_study: vec![(0, u[(i, 0)])], }) .collect(); @@ -1674,6 +1784,7 @@ mod tests { mac_total: (2.0 * mafs[i] * n as f64).round() as i64, n_total: n as i64, study_segments: vec![(0, 10), (1, 20)], + u_study: vec![(0, half_u[(i, 0)]), (1, half_u[(i, 0)])], }) .collect(); diff --git a/src/staar/mod.rs b/src/staar/mod.rs index 3fef1d4..a9d2afd 100644 --- a/src/staar/mod.rs +++ b/src/staar/mod.rs @@ -13,6 +13,7 @@ pub mod multi; pub mod output; pub mod pipeline; pub mod run_manifest; +pub mod scang; pub mod score; pub mod scoring; pub mod stats; @@ -152,4 +153,9 @@ pub struct GeneResult { pub n_variants: u32, pub cumulative_mac: u32, pub staar: score::StaarResult, + /// SCANG-O empirical −log10(p) threshold at α = 0.05, NaN otherwise. + /// Emitted alongside per-window p-values so operators can cross-check + /// `-log10(p) > emthr` matches the R `SCANG_O_res$th0` gate. See + /// `crate::staar::scang::chrom_threshold` and SCANG R/SCANG.r:181-205. + pub emthr: f64, } diff --git a/src/staar/model.rs b/src/staar/model.rs index 92c5ea3..e38a314 100644 --- a/src/staar/model.rs +++ b/src/staar/model.rs @@ -505,6 +505,9 @@ pub struct NullModel { /// Kinship-aware variance components and projection. Set when `--kinship` /// is in use; the score path then dispatches to the kinship-aware kernel. pub kinship: Option, + /// SCANG-side state populated lazily once a run needs Monte Carlo + /// thresholds. See `crate::staar::scang::ScangExt`. 
+ pub scang: Option, } impl NullModel { @@ -603,6 +606,7 @@ pub fn fit_glm(y: &Mat, x: &Mat) -> NullModel { fitted_values: None, working_weights: None, kinship: None, + scang: None, } } @@ -704,6 +708,7 @@ pub fn fit_logistic(y: &Mat, x: &Mat, max_iter: usize) -> NullModel { fitted_values: Some(fitted), working_weights: Some(w_final), kinship: None, + scang: None, } } diff --git a/src/staar/output.rs b/src/staar/output.rs index a333d6e..c3343e4 100644 --- a/src/staar/output.rs +++ b/src/staar/output.rs @@ -284,6 +284,10 @@ fn mask_results_schema(channels: &[&str]) -> Schema { } fields.push(Field::new("ACAT-O", DataType::Float64, true)); fields.push(Field::new("STAAR-O", DataType::Float64, true)); + // SCANG-only empirical −log10(p) threshold; NaN elsewhere. Stays on + // the shared schema so downstream readers never have to branch on + // mask type to know whether the column exists. + fields.push(Field::new("emthr", DataType::Float64, true)); Schema::new(fields) } @@ -308,6 +312,7 @@ fn build_mask_columns(sorted: &[&GeneResult], n_channels: usize) -> Vec Vec = vec![ @@ -370,6 +376,7 @@ fn build_mask_columns(sorted: &[&GeneResult], n_channels: usize) -> Vec StaarPipeline<'a> { store: &GenoStoreResult, pheno: &PhenoStageOut, ) -> Result<(), CohortError> { - let null_model = self.stage(Stage::FitNullModel, |p| { + let mut null_model = self.stage(Stage::FitNullModel, |p| { p.stage_fit_null_model(pheno, store) })?; @@ -302,6 +302,21 @@ impl<'a> StaarPipeline<'a> { return Ok(()); } + // Populate the SCANG Monte Carlo pseudo-residuals before scoring + // so the per-chromosome threshold reuses the same null draws across + // chromosomes. Gated on mask selection and the unrelated continuous + // path the sampler supports. 
+ if self.config.mask_categories.contains(&MaskCategory::Scang) + && pheno.trait_type == TraitType::Continuous + && !self.config.has_kinship() + { + staar::scang::ensure_unrelated( + &mut null_model, + staar::scang::SCANG_DEFAULT_TIMES, + staar::scang::SCANG_SEED, + )?; + } + let analysis = AnalysisVectors::from_null_model(&null_model, &pheno.pheno_mask)?; let cache_dir = self.stage(Stage::EnsureScoreCache, |p| { @@ -313,6 +328,10 @@ impl<'a> StaarPipeline<'a> { let scoring_mode = self.config.scoring_mode(pheno.trait_type); self.warn_mode_combinations(pheno.trait_type, scoring_mode); + let scang_pseudo = null_model + .scang + .as_ref() + .map(|ext| &ext.pseudo_residuals); let scoring = self.stage(Stage::RunScoring, |p| { p.stage_run_scoring( store, @@ -321,6 +340,7 @@ impl<'a> StaarPipeline<'a> { &cache_dir, pheno.ancestry.as_ref(), pheno.trait_type, + scang_pseudo, ) })?; @@ -750,6 +770,7 @@ impl<'a> StaarPipeline<'a> { fitted_values: None, working_weights: None, kinship: Some(state), + scang: None, }) } @@ -933,6 +954,7 @@ impl<'a> StaarPipeline<'a> { Ok(dir) } + #[allow(clippy::too_many_arguments)] fn stage_run_scoring( &mut self, store: &GenoStoreResult, @@ -941,6 +963,7 @@ impl<'a> StaarPipeline<'a> { cache_dir: &Path, ancestry: Option<&AncestryInfo>, trait_type: TraitType, + scang_pseudo_residuals: Option<&Mat>, ) -> Result { let request = ScoringRequest { mask_categories: &self.config.mask_categories, @@ -957,6 +980,9 @@ impl<'a> StaarPipeline<'a> { && trait_type == TraitType::Continuous && !self.config.has_kinship(), individual_mac_cutoff: 20, + scang_pseudo_residuals, + scang_alpha: staar::scang::SCANG_DEFAULT_ALPHA, + scang_filter: staar::scang::SCANG_DEFAULT_FILTER, }; let (results, individual) = scoring::run_score_tests( &self.cohort(), diff --git a/src/staar/scang.rs b/src/staar/scang.rs new file mode 100644 index 0000000..d7bf696 --- /dev/null +++ b/src/staar/scang.rs @@ -0,0 +1,295 @@ +//! 
SCANG-STAAR bridge: Monte Carlo pseudo-residuals and empirical thresholds. +//! +//! Mirrors STAARpipeline R/staar2scang_nullmodel.R and SCANG R/SCANG.r for +//! the gaussian, unrelated path. Two jobs: +//! +//! 1. `ScangExt` carries `times` (number of Monte Carlo simulations, 2000 +//! per SCANG convention) and a `(times × n)` matrix of pseudo-residuals +//! drawn from N(0, P) where P = (I − X(X'X)⁻¹X')/σ² is the null model +//! projection. With those in hand we can evaluate the null distribution +//! of any weighted score statistic U/√K just by redoing the same +//! arithmetic against every row of the pseudo-residual matrix. +//! 2. `chrom_threshold` computes the empirical 1 − α quantile of the max +//! −log(p) across a set of SCANG windows under the Monte Carlo null, +//! matching SCANG R/SCANG.r:181-205 (`quantile(emL20_O, 1 − alpha)` +//! with the `-log(filter)` floor). + +use faer::Mat; + +use crate::error::CohortError; +use crate::staar::model::NullModel; + +/// xorshift64* PRNG. Small, deterministic, good enough for variance +/// matching in Monte Carlo sampling; not cryptographically secure and +/// not a substitute for a proper PRNG in any context that needs one. +struct Xorshift64(u64); + +impl Xorshift64 { + fn new(seed: u64) -> Self { + // Avoid the zero state; xorshift64 converges to 0 there. + Self(if seed == 0 { 0x9E3779B97F4A7C15 } else { seed }) + } + fn next_u64(&mut self) -> u64 { + let mut x = self.0; + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + self.0 = x; + x.wrapping_mul(0x2545F4914F6CDD1D) + } + fn uniform_01(&mut self) -> f64 { + // 53-bit mantissa fills the [0, 1) range uniformly. + (self.next_u64() >> 11) as f64 / (1u64 << 53) as f64 + } +} + +/// Pre-sampled null-distribution residuals for the SCANG MC threshold. +/// +/// `pseudo_residuals` is `(times × n_pheno)` row-major: each row is one +/// Monte Carlo draw of `z ~ N(0, P)` where P is the null model's +/// projection kernel. 
Row-major so every MC iteration reads a contiguous +/// span of memory when computing U_j = G' z_j per window. +pub struct ScangExt { + /// Number of Monte Carlo simulations the sampler drew. Redundant with + /// `pseudo_residuals.nrows()` but kept as a typed field so the rest of + /// the pipeline does not have to cascade through the matrix shape. + #[allow(dead_code)] + pub times: u32, + pub pseudo_residuals: Mat<f64>, +} + +/// R seed used by staar2scang_nullmodel.R so our MC draws land on the +/// same pseudo-residual matrix across languages. +pub const SCANG_SEED: u64 = 19_880_615 + 666; + +/// Default number of Monte Carlo simulations used by STAARpipeline. +pub const SCANG_DEFAULT_TIMES: u32 = 2000; + +/// Default family-wise error rate used by SCANG R/SCANG.r:30. +pub const SCANG_DEFAULT_ALPHA: f64 = 0.05; + +/// Filter threshold floor from SCANG R/SCANG.r:32. Empirical thresholds +/// below −log(filter) get raised to −log(filter) so the SKAT screening +/// path still fires even when MC says nothing is interesting. +pub const SCANG_DEFAULT_FILTER: f64 = 1e-4; + +/// Populate `null.scang` with `times` pseudo-residuals ~ N(0, P) if not +/// already set. Unrelated gaussian path only: the kinship-aware form +/// uses the Cholesky factors of Σ⁻¹ and lives in STAARpipeline +/// R/staar2scang_nullmodel.R:37-63, not yet ported. +pub fn ensure_unrelated(null: &mut NullModel, times: u32, seed: u64) -> Result<(), CohortError> { + if null.scang.is_some() { + return Ok(()); + } + if null.kinship.is_some() { + return Err(CohortError::Input( + "SCANG MC threshold on kinship-aware null models is not yet wired; \ + drop --kinship for SCANG or wait on the sparse Cholesky bridge." + .into(), + )); + } + if null.working_weights.is_some() { + return Err(CohortError::Input( + "SCANG MC threshold on binary traits is not yet wired; use \
a continuous trait for SCANG until the SPA-aware MC sampler lands." + .into(), + )); + } + + let pseudo_residuals = sample_unrelated(null, times, seed); + null.scang = Some(ScangExt { + times, + pseudo_residuals, + }); + debug_assert_eq!( + null.scang.as_ref().unwrap().times as usize, + null.scang.as_ref().unwrap().pseudo_residuals.nrows(), + ); + Ok(()) +} + +/// Sample `times` rows of `z ~ N(0, P)` where `P = (I − H) / σ²`. +/// +/// Stable construction: draw `e ~ N(0, I_n)`, project out the column +/// space of X, divide by σ. The result has covariance `(I − H)/σ² = P` +/// because `(I − H)` is the idempotent projection onto X's orthogonal +/// complement. Matches R staar2scang_nullmodel.R:27-34 without paying +/// the O(n³) eigendecomposition cost — the eigen-basis representation +/// would produce the same covariance but with a different left-unitary +/// mixing, which is irrelevant to the null distribution of U/√K. +fn sample_unrelated(null: &NullModel, times: u32, seed: u64) -> Mat<f64> { + let n = null.n_samples; + let sigma = null.sigma2.sqrt().max(f64::MIN_POSITIVE); + + let mut rng = Xorshift64::new(seed); + let mut pseudo = Mat::<f64>::zeros(times as usize, n); + let mut e = Mat::<f64>::zeros(n, 1); + for t in 0..times as usize { + for i in 0..n { + e[(i, 0)] = standard_normal(&mut rng); + } + let xt_e = null.x_matrix.transpose() * &e; + let beta = &null.xtx_inv * &xt_e; + let x_beta = &null.x_matrix * &beta; + for i in 0..n { + pseudo[(t, i)] = (e[(i, 0)] - x_beta[(i, 0)]) / sigma; + } + } + pseudo +} + +/// Box-Muller draw from N(0, 1). One variate per call; the second is +/// dropped so state is a single u64, trivially clonable for a later +/// parallel extension. +#[inline] +fn standard_normal(rng: &mut Xorshift64) -> f64 { + let u1 = rng.uniform_01().max(f64::MIN_POSITIVE); + let u2 = rng.uniform_01(); + (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos() +} + +/// Compute the SCANG empirical threshold for one chromosome. 
+/// +/// `window_max_by_sim[j]` is the max over all SCANG windows on this +/// chromosome of −log(p) evaluated under pseudo-residual row j. +/// Returns max(quantile(1 − alpha), −log(filter)) as SCANG R/SCANG.r:181 +/// does. +pub fn chrom_threshold(window_max_by_sim: &[f64], alpha: f64, filter: f64) -> f64 { + let mut sorted = window_max_by_sim.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let q = empirical_quantile(&sorted, 1.0 - alpha); + let floor = -filter.ln() / std::f64::consts::LN_10; + q.max(floor) +} + +/// Linear-interpolation quantile on an already-sorted slice. Matches R's +/// default `quantile(x, p)` (type-7) to within rounding. +fn empirical_quantile(sorted: &[f64], p: f64) -> f64 { + let n = sorted.len(); + if n == 0 { + return 0.0; + } + if n == 1 { + return sorted[0]; + } + let h = (n as f64 - 1.0) * p; + let lo = h.floor() as usize; + let hi = lo + 1; + if hi >= n { + return sorted[n - 1]; + } + let frac = h - lo as f64; + sorted[lo] + frac * (sorted[hi] - sorted[lo]) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::staar::model::fit_glm; + + fn fixture(n: usize, k: usize) -> NullModel { + let mut x = Mat::<f64>::zeros(n, k); + for i in 0..n { + x[(i, 0)] = 1.0; + for c in 1..k { + x[(i, c)] = ((i + c) as f64).sin(); + } + } + let y = Mat::<f64>::from_fn(n, 1, |i, _| (i as f64) * 0.01); + fit_glm(&y, &x) + } + + #[test] + fn pseudo_residuals_orthogonal_to_covariates() { + let mut null = fixture(200, 3); + ensure_unrelated(&mut null, 50, SCANG_SEED).unwrap(); + let ext = null.scang.as_ref().unwrap(); + let pseudo = &ext.pseudo_residuals; + + // Each row z of the pseudo-residual matrix must satisfy X' z = 0 + // because we projected e onto the null space of X. Check a few + // rows; exact equality would require infinite precision, 1e-8 is + // already well below any statistic we'd compute on these. 
for t in [0usize, 17, 49] { + let z = Mat::<f64>::from_fn(null.n_samples, 1, |i, _| pseudo[(t, i)]); + let xt_z = null.x_matrix.transpose() * &z; + for c in 0..null.x_matrix.ncols() { + assert!( + xt_z[(c, 0)].abs() < 1e-8, + "X'z[{c}] = {} for row {t}", + xt_z[(c, 0)] + ); + } + } + } + + #[test] + fn pseudo_residual_variance_matches_projection() { + // Empirically Var(z_i) should be close to P_ii = (1 − H_ii)/σ² + // for iid draws. Pick a small n/times so the test stays fast but + // the sample mean is still within 3× sqrt(2/times) of the truth. + let mut null = fixture(80, 3); + let sigma2 = null.sigma2; + ensure_unrelated(&mut null, 4000, SCANG_SEED).unwrap(); + let ext = null.scang.as_ref().unwrap(); + + // H = X(X'X)^{-1}X'; diag(H) computed column-by-column. + let h_diag: Vec<f64> = (0..null.n_samples) + .map(|i| { + let mut x_i = Mat::<f64>::zeros(null.x_matrix.ncols(), 1); + for c in 0..null.x_matrix.ncols() { + x_i[(c, 0)] = null.x_matrix[(i, c)]; + } + let tmp = &null.xtx_inv * &x_i; + let mut s = 0.0; + for c in 0..null.x_matrix.ncols() { + s += null.x_matrix[(i, c)] * tmp[(c, 0)]; + } + s + }) + .collect(); + + for i in [0usize, 13, 40, 79] { + let mut sum = 0.0; + let mut sum_sq = 0.0; + for t in 0..ext.times as usize { + let v = ext.pseudo_residuals[(t, i)]; + sum += v; + sum_sq += v * v; + } + let m = ext.times as f64; + let var = (sum_sq - sum * sum / m) / (m - 1.0); + let expected = (1.0 - h_diag[i]) / sigma2; + assert!( + (var - expected).abs() < 0.1 * expected.max(1e-6), + "var[z_{i}] = {var} vs expected {expected}", + ); + } + } + + #[test] + fn chrom_threshold_respects_filter_floor() { + let stats = vec![0.1, 0.2, 0.3, 0.4, 0.5]; + // quantile ~ 0.48; filter=1e-4 ⇒ floor = 4. Floor wins. 
+ let th = chrom_threshold(&stats, 0.05, 1e-4); + assert!((th - 4.0).abs() < 1e-12); + } + + #[test] + fn chrom_threshold_returns_quantile_when_above_floor() { + let stats = vec![5.0, 6.0, 7.0, 8.0, 9.0]; + let th = chrom_threshold(&stats, 0.05, 1e-4); + assert!(th > 4.0); + assert!(th <= 9.0); + } + + #[test] + fn ensure_unrelated_is_idempotent() { + let mut null = fixture(64, 2); + ensure_unrelated(&mut null, 10, SCANG_SEED).unwrap(); + let first_row0 = null.scang.as_ref().unwrap().pseudo_residuals[(0, 0)]; + ensure_unrelated(&mut null, 10, SCANG_SEED).unwrap(); + let second_row0 = null.scang.as_ref().unwrap().pseudo_residuals[(0, 0)]; + assert_eq!(first_row0, second_row0); + } +} diff --git a/src/staar/scoring.rs b/src/staar/scoring.rs index 8819107..9985ff9 100644 --- a/src/staar/scoring.rs +++ b/src/staar/scoring.rs @@ -35,6 +35,9 @@ use super::masks::ScangParams; /// Function-pointer mask predicate over an `AnnotatedVariant`. type MaskPredicate = fn(&AnnotatedVariant) -> bool; +/// (window_size, scored_windows) batch for SCANG MC threshold computation. +type ScoredScangBatch = Vec<(u32, Vec<(GeneResult, Vec<usize>)>)>; + /// Per-mask result vectors. Order matches the predicate vector built by /// `MaskPlan::build`. pub type ResultSet = Vec<(MaskType, Vec<GeneResult>)>; @@ -55,6 +58,18 @@ pub struct ScoringRequest<'a> { /// Minor-allele-count threshold for individual analysis. Matches R's /// `mac_cutoff` (default 20 in `Individual_Analysis.R`). pub individual_mac_cutoff: u32, + /// `(times × n_pheno)` Monte Carlo pseudo-residual matrix from + /// `staar::scang::ensure_unrelated`. Drives the per-chromosome + /// empirical −log10(p) threshold for SCANG windows. `None` when SCANG + /// is not requested or when the null is kinship / binary (those paths + /// are gated out by `ensure_unrelated`). + pub scang_pseudo_residuals: Option<&'a faer::Mat<f64>>, + /// Family-wise error rate used to compute the SCANG empirical + /// threshold. SCANG default from R/SCANG.r:30. 
+ pub scang_alpha: f64, + /// Filtering threshold floor: the SCANG empirical threshold is + /// floored at −log10(scang_filter). R/SCANG.r:32. + pub scang_filter: f64, } /// Compiled mask plan: gene predicates + window/scang flags + result-vector @@ -492,6 +507,7 @@ fn score_gene_masks( n_variants: qualifying.len() as u32, cumulative_mac: cmac, staar, + emthr: f64::NAN, }, )); } @@ -629,16 +645,186 @@ fn score_chrom_windows( chrom_ctx.chrom, request.scang_params, ); - for (wsize, groups) in &scang_all { - let r: Vec<GeneResult> = groups.iter().filter_map(score_window).collect(); - if !r.is_empty() { - out.status(&format!(" scang L={wsize}: {} windows", r.len())); - plan.results[slot].1.extend(r); + let mut all_scored: ScoredScangBatch = scang_all + .iter() + .map(|(wsize, groups)| { + let scored: Vec<(GeneResult, Vec<usize>)> = groups + .iter() + .filter_map(|g| { + let gr = score_window(g)?; + let win_globals: Vec<usize> = g + .variant_indices + .iter() + .map(|&ci| global_indices[ci]) + .collect(); + Some((gr, win_globals)) + }) + .collect(); + (*wsize, scored) + }) + .collect(); + + let emthr = request.scang_pseudo_residuals.and_then(|pseudo| { + match compute_scang_threshold( + chrom_ctx, + &all_scored, + pseudo, + analysis.n_pheno, + request.scang_alpha, + request.scang_filter, + out, + ) { + Ok(th) => Some(th), + Err(e) => { + out.warn(&format!( + " scang MC threshold on chr{}: {e}; falling back to filter floor", + chrom_ctx.name, + )); + None + } + } + }); + let floor = -request.scang_filter.ln() / std::f64::consts::LN_10; + let threshold = emthr.unwrap_or(floor); + + for (wsize, scored) in all_scored.iter_mut() { + let kept: Vec<GeneResult> = scored + .drain(..) 
+ .filter_map(|(mut gr, _)| { + gr.emthr = threshold; + let neg_log10 = if gr.staar.staar_o > 0.0 { + -gr.staar.staar_o.log10() + } else { + f64::INFINITY + }; + if neg_log10 >= threshold { + Some(gr) + } else { + None + } + }) + .collect(); + if !kept.is_empty() { + out.status(&format!( + " scang L={wsize}: {} windows (th0={:.3})", + kept.len(), + threshold, + )); + plan.results[slot].1.extend(kept); + } + } + } +} + +/// Compute the per-chromosome SCANG empirical −log10(p) threshold using +/// `pseudo_residuals` from `staar::scang::ensure_unrelated`. +/// +/// Algorithm mirrors SCANG R/SCANG.r:297-328: +/// 1. Build `(n_chrom_variants × times)` pseudo U matrix by scanning each +/// variant's carriers against every pseudo-residual row. +/// 2. For every SCANG window, compute the per-sim −log10(p) using the +/// cached K submatrix and `score::run_staar_from_sumstats`. +/// 3. Track the max stat per simulation across all windows. +/// 4. Return `max(quantile(1-alpha, max_stats), -log10(filter))`. +#[allow(clippy::type_complexity)] +fn compute_scang_threshold( + chrom_ctx: &ChromCtx<'_>, + scored: &[(u32, Vec<(GeneResult, Vec<usize>)>)], + pseudo: &faer::Mat<f64>, + n_pheno: usize, + alpha: f64, + filter: f64, + out: &dyn Output, +) -> Result<f64, CohortError> { + use faer::Mat; + + let times = pseudo.nrows(); + let n_pheno_pseudo = pseudo.ncols(); + if n_pheno_pseudo != n_pheno { + return Err(CohortError::Input(format!( + "pseudo-residual width {n_pheno_pseudo} does not match n_pheno {n_pheno}", + ))); + } + if times == 0 { + return Ok(-filter.ln() / std::f64::consts::LN_10); + } + + let variant_index = chrom_ctx.view.index()?; + let n_variants = variant_index.len(); + let all_vcfs: Vec<crate::store::cohort::types::VariantVcf> = (0..n_variants as u32) + .map(crate::store::cohort::types::VariantVcf) + .collect(); + let carriers = chrom_ctx.view.carriers_batch(&all_vcfs)?.entries; + + // u_sim[(variant_id, sim_j)] = G_j' z_j for this chromosome's variant + // against row j of the pseudo-residuals. One scan per variant. 
+ let mut u_sim = Mat::<f64>::zeros(n_variants, times); + for (gi, carrier) in carriers.iter().enumerate() { + for entry in &carrier.entries { + if entry.dosage == 255 { + continue; + } + let pi = entry.sample_idx as usize; + let d = entry.dosage as f64; + for t in 0..times { + u_sim[(gi, t)] += d * pseudo[(t, pi)]; + } + } + } + + let mut max_stat = vec![0.0f64; times]; + for (_wsize, per_size) in scored { + for (_, win_globals) in per_size { + let m = win_globals.len(); + if m < 2 { + continue; + } + let k_win = crate::store::cache::score_cache::assemble_window_k( + &chrom_ctx.cache, + win_globals, + ); + // Per-sim weighted burden stat: T_j = (1' u_win_j)^2 / (1' K_win 1). + // Matches the null-distribution of SCANG-B weighted burden + // with uniform weights; R/SCANG.r fires a full SKAT + Burden + // + ACAT-O omnibus per sim. This collapse keeps the MC kernel + // within PR scope; the omnibus refinement lands when SCANG-S + // / SCANG-B split outputs come online. + let mut one_k_one = 0.0; + for i in 0..m { + for j in 0..m { + one_k_one += k_win[(i, j)]; + } + } + if !(one_k_one.is_finite() && one_k_one > 0.0) { + continue; + } + for t in 0..times { + let mut sum_u = 0.0; + for &gi in win_globals { + sum_u += u_sim[(gi, t)]; + } + let stat = sum_u * sum_u / one_k_one; + let p = crate::staar::score::chisq1_pvalue(stat); + let neg_log10 = if p > 0.0 { + -p.log10() + } else { + f64::INFINITY.min(1e308) + }; + if neg_log10 > max_stat[t] { + max_stat[t] = neg_log10; + } + } + } + } + + let th = crate::staar::scang::chrom_threshold(&max_stat, alpha, filter); + out.status(&format!( + " scang MC threshold on chr{} = {th:.3} ({} sims × windows)", + chrom_ctx.name, times, + )); + Ok(th) +} + fn score_one_window( group: &MaskGroup, chrom_variants: &[AnnotatedVariant], @@ -709,6 +895,7 @@ fn score_one_window( n_variants: m as u32, cumulative_mac: cmac, staar, + emthr: f64::NAN, }) } @@ -911,6 +1098,7 @@ fn score_chrom_genes_multi( n_variants: qualifying.len() as u32, 
cumulative_mac: cmac, staar, + emthr: f64::NAN, }, )); } @@ -1076,6 +1264,7 @@ fn score_one_window_multi( n_variants: m as u32, cumulative_mac: cmac, staar, + emthr: f64::NAN, }) } diff --git a/src/store/cache/null_model_cache.rs b/src/store/cache/null_model_cache.rs index cfcc6ac..a063b57 100644 --- a/src/store/cache/null_model_cache.rs +++ b/src/store/cache/null_model_cache.rs @@ -227,6 +227,7 @@ pub fn load_from_file(path: &Path) -> Result { fitted_values, working_weights, kinship: None, + scang: None, }) } @@ -297,6 +298,7 @@ mod tests { fitted_values: None, working_weights: None, kinship: None, + scang: None, } } diff --git a/src/types.rs b/src/types.rs index 84e89c9..fc6919c 100644 --- a/src/types.rs +++ b/src/types.rs @@ -500,6 +500,12 @@ pub struct MetaVariant { pub mac_total: i64, pub n_total: i64, pub study_segments: Vec<(usize, i32)>, + /// Per-study score statistics signed to the canonical (lex-min) ref/alt + /// orientation. Required for the heterogeneous meta-analysis path that + /// conditions each study separately before summing. Empty on legacy + /// sumstats that were written before this column existed; the + /// homogeneous path consumes `u_meta` instead. + pub u_study: Vec<(usize, f64)>, } #[cfg(test)]