From 82a42718f836a8fdfc189acad9a3d2070bdeb3fb Mon Sep 17 00:00:00 2001 From: Vineet Date: Mon, 20 Apr 2026 18:12:50 -0400 Subject: [PATCH] staar: multi-binary, multi-kinship, and random.slope nulls Port MultiNull::fit_binary (joint k-binary, lift of single-trait logistic + MultiSTAAR Kronecker; GMMAT rejects multi-binomial so no upstream), multi_kinship::fit_multi_kinship (port of GMMAT glmmkin.multi.ai), and kinship::fit_reml_random_slope (port of glmmkin random.slope expansion with PSD-box step-halving). k=1 parity bit-for-bit vs single-trait on all three paths. Closes #111, #80, #101. --- src/cli.rs | 8 + src/commands/staar.rs | 78 +- src/main.rs | 2 + src/staar/kinship/load.rs | 70 ++ src/staar/kinship/mod.rs | 341 ++++++++- src/staar/kinship/reml.rs | 158 +++- src/staar/kinship/types.rs | 24 + src/staar/mod.rs | 12 +- src/staar/model.rs | 116 ++- src/staar/multi.rs | 358 ++++++++- src/staar/multi_kinship.rs | 1405 ++++++++++++++++++++++++++++++++++++ src/staar/output.rs | 4 +- src/staar/pipeline.rs | 131 +++- src/staar/scoring.rs | 10 +- 14 files changed, 2598 insertions(+), 119 deletions(-) create mode 100644 src/staar/multi_kinship.rs diff --git a/src/cli.rs b/src/cli.rs index 9115547..673d981 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -275,6 +275,14 @@ pub enum Command { #[arg(long)] kinship_groups: Option, + /// Phenotype column carrying per-sample time values for the + /// longitudinal random-slope LMM (GMMAT `glmmkin.R:174-183`). + /// Each input --kinship matrix expands into three variance + /// components (intercept, intercept-slope covariance, slope). + /// Requires --kinship; gaussian family only; single-trait only. 
+ #[arg(long)] + random_slope: Option, + /// Known loci file for conditional analysis (one chr:pos:ref:alt per line) #[arg(long)] known_loci: Option, diff --git a/src/commands/staar.rs b/src/commands/staar.rs index 91f378d..de12627 100644 --- a/src/commands/staar.rs +++ b/src/commands/staar.rs @@ -37,6 +37,7 @@ pub struct StaarArgs { pub scang_step: usize, pub kinship: Vec, pub kinship_groups: Option, + pub random_slope: Option, pub known_loci: Option, pub null_model: Option, pub emit_sumstats: bool, @@ -78,11 +79,12 @@ fn build_config(args: StaarArgs) -> Result { )); } - // Multi-trait dispatch. Joint MultiSTAAR covers unrelated continuous - // traits with a shared covariate matrix; everything else (kinship-aware - // joint null, SPA for binary, AI-STAAR ancestry weighting, sumstats - // dump) is a separate track. Reject the combinations at config-build - // time so the pipeline never sees an invalid state. + // Multi-trait dispatch. Joint MultiSTAAR covers unrelated traits + // (gaussian or binomial) with a shared covariate matrix; kinship-aware + // joint null, single-trait SPA calibration, AI-STAAR ancestry + // weighting, and MetaSTAAR sumstats stay on their own tracks for now. + // Reject the combinations at config-build time so the pipeline never + // sees an invalid state. 
let multi_trait = args.trait_names.len() > 1; if multi_trait { let n_traits = args.trait_names.len(); @@ -95,7 +97,9 @@ fn build_config(args: StaarArgs) -> Result { if args.spa { return Err(reject( "--spa", - "SPA applies to binary traits; joint MultiSTAAR is unrelated continuous only", + "SPA is a single-trait tail-calibration path; joint MultiSTAAR \ + routes every per-variant common test through the joint χ²(k) \ + kernel instead", )); } if blank_to_none(args.ancestry_col.clone()).is_some() { @@ -104,23 +108,45 @@ fn build_config(args: StaarArgs) -> Result { "AI-STAAR ancestry weighting has not been composed with the joint multi-trait kernel", )); } - if !args.kinship.is_empty() { + // Multi-trait kinship + `--kinship-groups` are supported via + // `MultiNull::fit_with_kinship` (port of GMMAT's + // `fit_null_glmmkin_multi`, dense-backend first cut). The + // `--ancestry-col` reject above already covers the AI-STAAR + // combination which has not been composed with the kinship + // kernel yet. 
+ if args.emit_sumstats { return Err(reject( - "--kinship", - "kinship-aware joint null (fit_null_glmmkin_multi) is not yet implemented; \ - the current path assumes unrelated samples", + "--emit-sumstats", + "MetaSTAAR sumstats are single-trait; run each trait separately to dump U/K", )); } - if blank_to_none(args.kinship_groups.clone()).is_some() { + if blank_to_none(args.random_slope.clone()).is_some() { return Err(reject( - "--kinship-groups", - "kinship-aware joint null is not yet implemented", + "--random-slope", + "GMMAT `glmmkin.R:50` rejects multi-pheno + duplicated ids; the joint \ + multi-trait null does not compose with random slopes", )); } - if args.emit_sumstats { - return Err(reject( - "--emit-sumstats", - "MetaSTAAR sumstats are single-trait; run each trait separately to dump U/K", + } + + // `--random-slope` requires kinship and gaussian family (single-trait + // binary runs through PQL which doesn't implement the random-slope + // expansion; GMMAT `glmmkin.R:30` enforces `method.optim == "AI"` for + // random.slope which is what our `fit_reml_random_slope` uses). + if blank_to_none(args.random_slope.clone()).is_some() { + if args.kinship.is_empty() { + return Err(CohortError::Input( + "--random-slope requires at least one --kinship matrix. GMMAT \ + `glmmkin.R:64` warns and drops random.slope when samples are unrelated \ + and there is no kinship." + .into(), + )); + } + if args.spa { + return Err(CohortError::Input( + "--random-slope is gaussian-only. --spa applies to binary traits and is \ + not compatible with the random-slope LMM." 
+ .into(), )); } } @@ -232,6 +258,7 @@ fn build_config(args: StaarArgs) -> Result { }, kinship: args.kinship, kinship_groups: blank_to_none(args.kinship_groups), + random_slope: blank_to_none(args.random_slope), known_loci: args.known_loci, null_model_path: args.null_model, run_mode: if multi_trait { @@ -563,6 +590,7 @@ mod tests { scang_step: 10, kinship: Vec::new(), kinship_groups: None, + random_slope: None, known_loci: None, null_model: None, emit_sumstats: false, @@ -621,22 +649,6 @@ mod tests { assert_input_error_mentions(build_config(args), "--ancestry-col"); } - #[test] - fn multi_trait_rejects_kinship() { - let pheno = pheno_file(); - let mut args = multi_trait_args(pheno.path().to_path_buf(), &["BMI", "HEIGHT"]); - args.kinship = vec![PathBuf::from("/no/such/kin.tsv")]; - assert_input_error_mentions(build_config(args), "--kinship"); - } - - #[test] - fn multi_trait_rejects_kinship_groups() { - let pheno = pheno_file(); - let mut args = multi_trait_args(pheno.path().to_path_buf(), &["BMI", "HEIGHT"]); - args.kinship_groups = Some("ancestry".into()); - assert_input_error_mentions(build_config(args), "--kinship-groups"); - } - #[test] fn multi_trait_rejects_emit_sumstats() { let pheno = pheno_file(); diff --git a/src/main.rs b/src/main.rs index ffab530..4f6983f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -154,6 +154,7 @@ fn run( scang_step, kinship, kinship_groups, + random_slope, known_loci, null_model, emit_sumstats, @@ -187,6 +188,7 @@ fn run( scang_step, kinship, kinship_groups, + random_slope, known_loci, null_model, emit_sumstats, diff --git a/src/staar/kinship/load.rs b/src/staar/kinship/load.rs index 63691bd..f67f986 100644 --- a/src/staar/kinship/load.rs +++ b/src/staar/kinship/load.rs @@ -244,6 +244,76 @@ pub fn load_groups( Ok(partition) } +/// Load the per-sample time vector for the random-slope LMM. Ports the +/// `time.var <- data[idx, random.slope]` extraction at upstream +/// `GMMAT/R/glmmkin.R:113`. 
Rows are returned in the same compacted +/// order `load_phenotype` produces (i.e., filtered by `pheno_mask` and +/// aligned with `geno.sample_names`), so `time_var[i]` aligns with the +/// i-th row of `y` / `x`. +pub fn load_random_slope( + engine: &DfEngine, + phenotype: &Path, + time_col: &str, + geno: &GenotypeResult, + pheno_mask: &[bool], + column_map: &HashMap, + out: &dyn Output, +) -> Result, CohortError> { + engine.register_csv("_pheno_rs", phenotype, b'\t')?; + let cols = engine.table_columns("_pheno_rs")?; + + let resolved = + crate::staar::model::resolve_column(time_col, &cols, column_map, "RandomSlope")?; + let id_col = crate::staar::model::resolve_id_column(&cols, column_map); + let sample_list = geno + .sample_names + .iter() + .map(|s| format!("'{s}'")) + .collect::>() + .join(","); + + let sql = format!( + "SELECT \"{id_col}\", CAST(\"{resolved}\" AS DOUBLE) \ + FROM _pheno_rs \ + WHERE \"{id_col}\" IN ({sample_list}) AND \"{resolved}\" IS NOT NULL" + ); + let batches = engine.collect(&sql)?; + + let mut id_to_value: HashMap = HashMap::new(); + for batch in &batches { + for row in 0..batch.num_rows() { + let id = crate::staar::model::arrow_str(batch.column(0).as_ref(), row); + let v = crate::staar::model::arrow_f64(batch.column(1).as_ref(), row); + if !v.is_finite() { + return Err(CohortError::Input(format!( + "random.slope column '{resolved}' has non-finite value for sample '{id}'" + ))); + } + id_to_value.insert(id, v); + } + } + + let mut out_vec: Vec = Vec::new(); + for (vcf_idx, name) in geno.sample_names.iter().enumerate() { + if !pheno_mask[vcf_idx] { + continue; + } + let v = *id_to_value.get(name).ok_or_else(|| { + CohortError::Input(format!( + "Sample '{name}' has no value in random.slope column '{resolved}'" + )) + })?; + out_vec.push(v); + } + out.status(&format!( + " Random-slope time values: n = {}, range = [{}, {}]", + out_vec.len(), + out_vec.iter().cloned().fold(f64::INFINITY, f64::min), + 
out_vec.iter().cloned().fold(f64::NEG_INFINITY, f64::max), + )); + Ok(out_vec) +} + #[cfg(test)] mod tests { use super::*; diff --git a/src/staar/kinship/mod.rs b/src/staar/kinship/mod.rs index 435c27a..c2513f6 100644 --- a/src/staar/kinship/mod.rs +++ b/src/staar/kinship/mod.rs @@ -54,10 +54,11 @@ pub mod types; pub use budget::{check_memory_budget, dense_path_fits}; #[cfg(test)] pub use budget::DEFAULT_KINSHIP_MEM_BYTES; -pub use load::{load_groups, load_kinship}; +pub use load::{load_groups, load_kinship, load_random_slope}; pub use pql::fit_pql_glmm; pub use types::{ - GroupPartition, KinshipInverse, KinshipMatrix, KinshipState, VarianceComponents, + CovarianceIdx, GroupPartition, KinshipInverse, KinshipMatrix, KinshipState, + VarianceComponents, }; use faer::Mat; @@ -192,6 +193,210 @@ pub fn fit_reml( dense::fit_reml_dense(y, x, kinships, groups, &w_vec, init_tau, budget_bytes) } +/// Expand an input kinship list for random-slope longitudinal LMM. +/// +/// Ports `GMMAT::glmmkin.R:174-183`. For each input `Φ_l` and a +/// per-sample time vector `t`, emit three entries: +/// +/// * intercept variance `Φ_l` — left in place (same matrix as input). +/// * intercept-slope covariance `Φ_l · diag(t) + diag(t) · Φ_l`. +/// * slope variance `Φ_l .* (t · tᵀ)` — element-wise product of `Φ_l` +/// with the outer product of `t` with itself. +/// +/// Output order is `[int_1..int_L, cov_1..cov_L, slope_1..slope_L]`, so +/// a caller building the `VarianceComponents` layout gets the natural +/// (kinship-major, purpose-minor) ordering. This matches GMMAT's +/// `kins`/`kins[[q+i]]`/`kins[[2*q+i]]` indexing at upstream +/// `glmmkin.R:176-178`. +/// +/// Both input variants (Dense / Sparse) are handled; output is always +/// a dense matrix since the column-scaled Kronecker structure destroys +/// the input sparsity pattern (the scaled matrix has `2·nnz(Φ)` non- +/// zeros at most, but the structure changes per entry). 
+pub fn expand_for_random_slope( + kinships: &[KinshipMatrix], + time_var: &[f64], +) -> Result, CohortError> { + let l = kinships.len(); + if l == 0 { + return Err(CohortError::Input( + "random.slope requires at least one kinship matrix — GMMAT `glmmkin.R:64` \ + drops the argument when kinship is NULL and samples are unrelated" + .into(), + )); + } + let n = time_var.len(); + for k in kinships { + if k.n() != n { + return Err(CohortError::Input(format!( + "random.slope time vector has length {} but kinship '{}' has n={}", + n, + k.label(), + k.n(), + ))); + } + } + + let to_dense = |k: &KinshipMatrix| -> Mat { + match k { + KinshipMatrix::Dense { matrix, .. } => matrix.clone(), + KinshipMatrix::Sparse { matrix, .. } => { + let mut d = Mat::::zeros(n, n); + let cp = matrix.symbolic().col_ptr(); + let ri = matrix.symbolic().row_idx(); + let val = matrix.val(); + for c in 0..n { + let s = cp[c] as usize; + let e = cp[c + 1] as usize; + for kk in s..e { + let r = ri[kk] as usize; + d[(r, c)] = val[kk]; + } + } + d + } + } + }; + + let mut intercepts = Vec::with_capacity(l); + let mut covariances = Vec::with_capacity(l); + let mut slopes = Vec::with_capacity(l); + + for (li, kin) in kinships.iter().enumerate() { + let k_dense = to_dense(kin); + let label = kin.label().to_string(); + intercepts.push(KinshipMatrix::Dense { + matrix: k_dense.clone(), + label: format!("{label}_int_{li}"), + }); + + // cov = Φ · diag(t) + diag(t) · Φ + // (i, j) = Φ[i, j] · (t_i + t_j) + let mut cov = Mat::::zeros(n, n); + for i in 0..n { + for j in 0..n { + cov[(i, j)] = k_dense[(i, j)] * (time_var[i] + time_var[j]); + } + } + covariances.push(KinshipMatrix::Dense { + matrix: cov, + label: format!("{label}_cov_{li}"), + }); + + // slope = Φ .* (t · tᵀ) + // (i, j) = Φ[i, j] · t_i · t_j + let mut slope = Mat::::zeros(n, n); + for i in 0..n { + for j in 0..n { + slope[(i, j)] = k_dense[(i, j)] * time_var[i] * time_var[j]; + } + } + slopes.push(KinshipMatrix::Dense { + matrix: slope, + 
label: format!("{label}_slope_{li}"), + }); + } + + let mut out = Vec::with_capacity(3 * l); + out.extend(intercepts); + out.extend(covariances); + out.extend(slopes); + Ok(out) +} + +/// Fit AI-REML with random slopes on the time variable. See +/// [`expand_for_random_slope`] for the variance-component structure. +/// +/// Input layout (caller-supplied): +/// * `kinships` — `L` original kinship matrices `Φ_1..Φ_L`. +/// * `groups` — residual group partition (pass `GroupPartition::single` +/// if no per-group heteroscedasticity is wanted). +/// * `time_var` — per-sample time values. GMMAT's semantic is "time +/// since baseline at the measurement"; any numeric column works. +/// +/// Output layout (on the returned `KinshipState.tau`): +/// * indices `0..L` — `τ` for the intercept variance of each input `Φ`. +/// * indices `L..2L` — `τ` for the intercept-slope covariance of each `Φ`. +/// * indices `2L..3L` — `τ` for the slope variance of each `Φ`. +/// * indices `3L..3L+G` — per-group residual variances (standard groups). +/// +/// The heritability field `state.h2` holds per-input-`Φ` *intercept* +/// heritability (`τ[int_l] / Σ τ`) — it is not meaningful for the cov +/// or slope components, which are not standalone variance fractions. +/// Cross-check: at `time_var = 0`, this function reduces to `fit_reml` +/// with the input kinship list (the cov and slope entries collapse to +/// zero matrices and AI-REML pins their `τ` at zero via boundary refit). 
+pub fn fit_reml_random_slope( + y: &Mat, + x: &Mat, + kinships: &[KinshipMatrix], + groups: &GroupPartition, + time_var: &[f64], + budget_bytes: u64, +) -> Result { + if kinships.is_empty() { + return Err(CohortError::Input( + "random.slope requires at least one kinship matrix".into(), + )); + } + let n = y.nrows(); + if time_var.len() != n { + return Err(CohortError::Input(format!( + "random.slope time vector length {} does not match phenotype n={n}", + time_var.len(), + ))); + } + if groups.n_samples() != n { + return Err(CohortError::Input(format!( + "group partition covers {} samples but y has {n} rows", + groups.n_samples() + ))); + } + + let expanded = expand_for_random_slope(kinships, time_var)?; + let l = kinships.len(); + let g = groups.n_groups(); + let triples: Vec = (0..l) + .map(|li| CovarianceIdx { + cov_idx: l + li, + var_int_idx: li, + var_slope_idx: 2 * l + li, + }) + .collect(); + + // Warm start: variance entries at var(Y)/(3L+G), covariance entries + // at 0 (per `glmmkin.R:446-447` which sets diagonal of the + // Kronecker-flattened (k×k) to var/q and off-diagonal to 0). + let y_mean: f64 = (0..n).map(|i| y[(i, 0)]).sum::() / n as f64; + let y_var: f64 = (0..n).map(|i| (y[(i, 0)] - y_mean).powi(2)).sum::() + / (n as f64 - 1.0).max(1.0); + let n_comp = 3 * l + g; + let warm = (y_var / n_comp as f64).max(1e-6); + let mut init_tau = VarianceComponents::zeros(3 * l, g); + for (li, kin) in kinships.iter().enumerate() { + let mean_diag = kin.mean_diagonal().max(1e-6); + init_tau.set_kinship(li, warm / mean_diag); // intercept + init_tau.set_kinship(l + li, 0.0); // covariance + init_tau.set_kinship(2 * l + li, warm / mean_diag); // slope + } + for gi in 0..g { + init_tau.set_group(gi, warm); + } + + let weights = vec![1.0; n]; + let state = reml::run_reml_constrained( + y, + x, + &expanded, + groups, + &weights, + init_tau, + &triples, + budget_bytes, + )?; + Ok(state) +} + #[cfg(test)] mod tests { //! Integration tests across the public dispatcher. 
Per-component tests @@ -332,6 +537,138 @@ mod tests { } } + #[test] + fn expand_for_random_slope_shapes_and_symmetries() { + let n = 4; + let kin = { + let mut m = Mat::::zeros(n, n); + for i in 0..n { + m[(i, i)] = 1.0; + } + m[(0, 1)] = 0.5; + m[(1, 0)] = 0.5; + KinshipMatrix::Dense { + matrix: m, + label: "k".into(), + } + }; + let t = vec![1.0, 2.0, 3.0, 4.0]; + let expanded = expand_for_random_slope(std::slice::from_ref(&kin), &t).unwrap(); + // 3L = 3 entries (intercept, cov, slope) for one input kinship. + assert_eq!(expanded.len(), 3); + + // Intercept is the input kinship untouched. + let int_mat = expanded[0].as_dense().unwrap(); + assert!((int_mat[(0, 0)] - 1.0).abs() < 1e-12); + assert!((int_mat[(0, 1)] - 0.5).abs() < 1e-12); + + // Covariance at (0,1) is K[0,1] · (t_0 + t_1) = 0.5 · 3 = 1.5. + let cov_mat = expanded[1].as_dense().unwrap(); + assert!((cov_mat[(0, 1)] - 1.5).abs() < 1e-12); + // Symmetric: cov[1,0] = cov[0,1]. + assert!((cov_mat[(1, 0)] - 1.5).abs() < 1e-12); + + // Slope at (0,1) is K[0,1] · t_0 · t_1 = 0.5 · 2 = 1.0. + let slope_mat = expanded[2].as_dense().unwrap(); + assert!((slope_mat[(0, 1)] - 1.0).abs() < 1e-12); + // Diagonal slope[i,i] = K[i,i] · t_i² = 1 · t_i². + for i in 0..n { + assert!((slope_mat[(i, i)] - t[i].powi(2)).abs() < 1e-12); + } + } + + /// Random-slope convergence smoke: repeated-measures data with a + /// within-subject time trend should fit to convergence and return + /// a PSD random-effects matrix. No upstream fixture yet — this is + /// a feasibility + structural check, not a numerical parity check. + #[test] + fn fit_reml_random_slope_converges_on_longitudinal_data() { + // 10 subjects × 3 repeated measurements each = n=30. + let n_subj = 10; + let reps = 3; + let n = n_subj * reps; + // Subject-level kinship: identity at the row level is the + // "repeated-measures" kinship that GMMAT adds when duplicated + // ids are detected (see `glmmkin.R:55-62`). 
For our smoke test + // we use a block-diagonal kinship where within-subject blocks + // are 1 and between-subject off-diagonal is 0 — i.e., repeated + // measurements share the same random intercept. + let mut k = Mat::::zeros(n, n); + for s in 0..n_subj { + for r1 in 0..reps { + for r2 in 0..reps { + k[(s * reps + r1, s * reps + r2)] = 1.0; + } + } + } + let kin = KinshipMatrix::Dense { + matrix: k, + label: "subject".into(), + }; + + // Time values cycle 0, 1, 2 per subject. + let time_var: Vec = (0..n).map(|i| (i % reps) as f64).collect(); + + // Deterministic phenotype with per-subject random intercept and + // slope plus residual noise. + let mut state: u64 = 9999; + let mut u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut subj_int = vec![0.0; n_subj]; + let mut subj_slope = vec![0.0; n_subj]; + for s in 0..n_subj { + subj_int[s] = 2.0 * u01() - 1.0; + subj_slope[s] = (2.0 * u01() - 1.0) * 0.3; + } + let mut y = Mat::::zeros(n, 1); + let mut x = Mat::::zeros(n, 1); + for i in 0..n { + let s = i / reps; + let t = time_var[i]; + let noise = (2.0 * u01() - 1.0) * 0.5; + y[(i, 0)] = 1.0 + subj_int[s] + subj_slope[s] * t + noise; + x[(i, 0)] = 1.0; + } + + let groups = GroupPartition::single(n); + let state = fit_reml_random_slope( + &y, + &x, + std::slice::from_ref(&kin), + &groups, + &time_var, + 1 << 30, + ) + .expect("random.slope should converge on this longitudinal fixture"); + + // τ layout: [var_int, var_cov, var_slope, var_group]. 
+ assert_eq!(state.tau.n_total(), 4); + let var_int = state.tau.as_slice()[0]; + let var_cov = state.tau.as_slice()[1]; + let var_slope = state.tau.as_slice()[2]; + let var_res = state.tau.as_slice()[3]; + + assert!(var_int.is_finite() && var_int >= 0.0, "var_int: {var_int}"); + assert!( + var_slope.is_finite() && var_slope >= 0.0, + "var_slope: {var_slope}", + ); + assert!(var_res.is_finite() && var_res >= 0.0, "var_res: {var_res}"); + assert!(var_cov.is_finite(), "var_cov: {var_cov}"); + // PSD invariant: |cov| ≤ √(var_int · var_slope). + let psd_limit = (var_int * var_slope).sqrt(); + assert!( + var_cov.abs() <= psd_limit * (1.0 + 1e-6), + "PSD violated: |cov|={} > √(var_int · var_slope)={}", + var_cov.abs(), + psd_limit, + ); + } + #[test] fn fit_pql_glmm_runs_and_returns_state() { let kin = block_family_kinship(20, 10, "fam"); diff --git a/src/staar/kinship/reml.rs b/src/staar/kinship/reml.rs index 03ae2d7..896aa6a 100644 --- a/src/staar/kinship/reml.rs +++ b/src/staar/kinship/reml.rs @@ -51,7 +51,8 @@ use crate::staar::kinship::sparse; use crate::staar::kinship::sparse::takahashi::TakahashiNumeric; use crate::staar::kinship::sparse::SparseFactor; use crate::staar::kinship::types::{ - GroupPartition, KinshipInverse, KinshipMatrix, KinshipState, VarianceComponents, + CovarianceIdx, GroupPartition, KinshipInverse, KinshipMatrix, KinshipState, + VarianceComponents, }; /// REML convergence tolerance. Upstream `glmmkin.R:122` `tol = 1e-5`. @@ -583,7 +584,13 @@ fn solve_dtau(ai: &Mat, score: &[f64], fixtau: &[bool]) -> Vec { /// Inner AI-REML iteration loop. Runs `ai_step` until the convergence /// criterion is met or `REML_MAX_ITER` is exhausted. Step-halving is /// applied per upstream `glmmkin.R:362-366` whenever the Newton step -/// would push a free τ negative. +/// would push a free τ negative (or, for random.slope covariance slots, +/// outside the PSD box `τ_cov² ≤ τ_int · τ_slope`). 
+/// +/// `triples` holds covariance-index triples for the random-slope path +/// (GMMAT `glmmkin.R:175-183`). An empty slice means "no covariance +/// slots" — the feasibility check reduces to `all τ ≥ 0` and behavior +/// is bit-identical to the pre-random-slope loop. #[allow(clippy::too_many_arguments)] fn converge( y: &Mat, @@ -593,6 +600,7 @@ fn converge( weights: &[f64], init_tau: VarianceComponents, fixtau: &[bool], + triples: &[CovarianceIdx], builder: &B, ) -> Result { let n = y.nrows(); @@ -604,6 +612,32 @@ fn converge( debug_assert_eq!(fixtau.len(), n_comp); let _ = n; + let mut cov_slot = vec![false; n_comp]; + for t in triples { + debug_assert!(t.cov_idx < n_comp); + debug_assert!(t.var_int_idx < n_comp); + debug_assert!(t.var_slope_idx < n_comp); + cov_slot[t.cov_idx] = true; + } + let is_feasible = |tau: &[f64]| -> bool { + for (i, &t) in tau.iter().enumerate() { + if !cov_slot[i] && t < 0.0 { + return false; + } + } + for tri in triples { + let v1 = tau[tri.var_int_idx]; + let v2 = tau[tri.var_slope_idx]; + if v1 < 0.0 || v2 < 0.0 { + return false; + } + if tau[tri.cov_idx].abs() > (v1 * v2).sqrt() { + return false; + } + } + true + }; + let mut tau = init_tau; let mut alpha_prev = Mat::::zeros(k, 1); let mut iter_used = 0; @@ -616,8 +650,9 @@ fn converge( let step_out = ai_step(y, x, kinships, groups, weights, &tau, fixtau, solver)?; - // Newton step + step-halving on negative τ. Matches upstream - // `glmmkin.R:357-367`. Free components update; fixed stay put. + // Newton step + step-halving on non-feasible τ. Matches upstream + // `glmmkin.R:357-367` (plain) and `:377-389` (covariance-constrained). + // Free components update; fixed stay put. 
let mut step = step_out.d_tau.clone(); let mut try_count = 0; loop { @@ -626,13 +661,19 @@ fn converge( tau.as_slice_mut()[i] = tau0.as_slice()[i] + step[i]; } } - for i in 0..n_comp { + // Clip non-covariance slots that are near zero on both + // sides of the step down to zero (matches upstream's + // `tau[tau < tol & tau0 < tol] <- 0`). + for (i, &is_cov) in cov_slot.iter().enumerate() { + if is_cov { + continue; + } let v = tau.as_slice()[i]; if v < REML_TOL && tau0.as_slice()[i] < REML_TOL { tau.as_slice_mut()[i] = 0.0; } } - if tau.as_slice().iter().all(|&t| t >= 0.0) { + if is_feasible(tau.as_slice()) { break; } for s in step.iter_mut() { @@ -645,9 +686,25 @@ fn converge( ))); } } - for v in tau.as_slice_mut().iter_mut() { - if *v < REML_TOL { - *v = 0.0; + for (i, &is_cov) in cov_slot.iter().enumerate() { + if is_cov { + continue; + } + if tau.as_slice()[i] < REML_TOL { + tau.as_slice_mut()[i] = 0.0; + } + } + // Snap covariance slots onto the PSD edge if they've drifted up + // against it — avoids oscillation right at the boundary. + // Mirrors `glmmkin.R:388-389`. + for tri in triples { + let v1 = tau.as_slice()[tri.var_int_idx].max(0.0); + let v2 = tau.as_slice()[tri.var_slope_idx].max(0.0); + let edge = (1.0 - 1.01 * REML_TOL) * (v1 * v2).sqrt(); + let c = tau.as_slice()[tri.cov_idx]; + if c.abs() > edge { + tau.as_slice_mut()[tri.cov_idx] = + if c >= 0.0 { edge } else { -edge }; } } @@ -729,11 +786,84 @@ pub fn run_reml( weights: &[f64], init_tau: VarianceComponents, builder: &B, +) -> Result { + run_reml_constrained_inner( + y, + x, + kinships, + groups, + weights, + init_tau, + &[], + builder, + ) +} + +/// Variant of [`run_reml`] that accepts covariance-index triples for +/// the random-slope path (`glmmkin.R:175-183`). Every τ_cov stays +/// inside the PSD box `|τ_cov| ≤ √(τ_int · τ_slope)`. +/// +/// This is the top-level entry point the `kinship::fit_reml_random_slope` +/// wrapper calls. 
Callers that do not need the random-slope constraint +/// should use [`run_reml`] — passing an empty triples slice here is +/// bit-identical but more code. +#[allow(clippy::too_many_arguments)] +pub fn run_reml_constrained( + y: &Mat, + x: &Mat, + kinships: &[KinshipMatrix], + groups: &GroupPartition, + weights: &[f64], + init_tau: VarianceComponents, + triples: &[CovarianceIdx], + budget_bytes: u64, +) -> Result { + // Random-slope currently only supports the dense backend. Sparse + // multi-trait fits end-to-end through `run_reml`, but the + // covariance-box step-halving isn't yet exercised via sparse, so + // the top-level wrapper routes explicitly to dense here. + let n = y.nrows(); + let builder = crate::staar::kinship::dense::DenseBuilder { + n, + kinships, + groups, + weights, + }; + let mut state = run_reml_constrained_inner( + y, + x, + kinships, + groups, + weights, + init_tau, + triples, + &builder, + )?; + state.budget_bytes = budget_bytes; + Ok(state) +} + +#[allow(clippy::too_many_arguments)] +fn run_reml_constrained_inner( + y: &Mat, + x: &Mat, + kinships: &[KinshipMatrix], + groups: &GroupPartition, + weights: &[f64], + init_tau: VarianceComponents, + triples: &[CovarianceIdx], + builder: &B, ) -> Result { let l = kinships.len(); let g = groups.n_groups(); let n_comp = l + g; + let mut cov_slot = vec![false; n_comp]; + for t in triples { + debug_assert!(t.cov_idx < n_comp); + cov_slot[t.cov_idx] = true; + } + let mut fixtau = vec![false; n_comp]; let mut state = converge( y, @@ -743,13 +873,20 @@ pub fn run_reml( weights, init_tau, &fixtau, + triples, builder, )?; for refit_iter in 0..REML_MAX_OUTER_REFITS { let mut changed = false; for (i, fixed) in fixtau.iter_mut().enumerate().take(n_comp) { - if !*fixed && state.tau.as_slice()[i] < BOUNDARY_FACTOR * REML_TOL { + if *fixed || cov_slot[i] { + // Never fix-at-boundary for covariance slots — they can + // legitimately sit near zero on the way to a larger + // estimate and the PSD box handles their 
boundary. + continue; + } + if state.tau.as_slice()[i] < BOUNDARY_FACTOR * REML_TOL { *fixed = true; changed = true; } @@ -772,6 +909,7 @@ pub fn run_reml( weights, tau_refit, &fixtau, + triples, builder, )?; state.outer_refits = refit_iter + 1; diff --git a/src/staar/kinship/types.rs b/src/staar/kinship/types.rs index ae46369..517f00d 100644 --- a/src/staar/kinship/types.rs +++ b/src/staar/kinship/types.rs @@ -223,6 +223,17 @@ impl GroupPartition { } } + /// No per-group residual variance. Used by the multi-trait kinship path + /// (GMMAT `glmmkin.R:424-536::glmmkin.multi.ai`) where every variance + /// component lives in the expanded kinship list as `E_jk ⊗ V` blocks + /// and the residual term is itself one of those kinship entries. + pub fn empty(n: usize) -> Self { + Self { + groups: Vec::new(), + n, + } + } + /// Build from a per-sample group-index assignment plus the group label /// table. Errors on out-of-range indices, empty groups, or zero samples. /// Labels are consumed for validation messages but not retained. @@ -275,6 +286,19 @@ impl GroupPartition { } } +/// PSD-box constraint for random-slope longitudinal LMM. Ports +/// `covariance.idx` from `GMMAT/R/glmmkin.R:175-183`. Indices are into +/// the flat `τ` layout used by `fit_reml_random_slope` +/// (kinship slots 0..3L, group slots 3L..3L+G). The covariance-slope +/// matrix `D = [[τ_int, τ_cov], [τ_cov, τ_slope]]` is PSD iff +/// `τ_cov² ≤ τ_int · τ_slope`. +#[derive(Clone, Copy, Debug)] +pub struct CovarianceIdx { + pub cov_idx: usize, + pub var_int_idx: usize, + pub var_slope_idx: usize, +} + /// Variance components for AI-REML, laid out `[τ_kinship_1..τ_kinship_L, /// τ_group_1..τ_group_G]`. The kinship/group split is carried with the data /// so accessors are typed and indices cannot be confused. 
diff --git a/src/staar/mod.rs b/src/staar/mod.rs index c79af09..61bdbec 100644 --- a/src/staar/mod.rs +++ b/src/staar/mod.rs @@ -12,6 +12,7 @@ pub mod masks; pub mod meta; pub mod model; pub mod multi; +pub mod multi_kinship; pub mod output; pub mod pipeline; pub mod run_manifest; @@ -37,10 +38,13 @@ pub enum RunMode { Analyze, /// Stop after the null model and dump per-variant U/K for MetaSTAAR. EmitSumstats, - /// Joint multi-trait STAAR for unrelated continuous traits with shared - /// covariates. Activated when `--trait-name` carries more than one name. - /// Rejects `--spa`, `--ancestry-col`, `--kinship`, and `--emit-sumstats` - /// at config build time; kinship-aware joint nulls are a separate track. + /// Joint multi-trait STAAR for unrelated traits with shared + /// covariates. Gaussian uses the MultiSTAAR Kronecker null; binomial + /// uses the lift of single-trait logistic + MultiSTAAR Kronecker + /// (`MultiNull::fit_binary`). Activated when `--trait-name` carries + /// more than one name. Rejects `--spa`, `--ancestry-col`, `--kinship`, + /// and `--emit-sumstats` at config build time; kinship-aware joint + /// nulls are a separate track. MultiTrait, } diff --git a/src/staar/model.rs b/src/staar/model.rs index e38a314..c549859 100644 --- a/src/staar/model.rs +++ b/src/staar/model.rs @@ -121,7 +121,7 @@ pub(crate) fn arrow_str(col: &dyn Array, row: usize) -> String { } /// Extract a f64 from any Arrow numeric column (handles Float64, Int64, etc.). -fn arrow_f64(col: &dyn Array, row: usize) -> f64 { +pub(crate) fn arrow_f64(col: &dyn Array, row: usize) -> f64 { if col.is_null(row) { return f64::NAN; } if let Some(a) = col.as_any().downcast_ref::() { return a.value(row); @@ -177,10 +177,12 @@ pub fn load_phenotype( resolved_covs.push(resolved); } - // Trait type detection runs against the first resolved column. 
Joint - // multi-trait STAAR is unrelated continuous only, so a binary first - // trait is a hard error in multi mode; single-trait keeps the existing - // continuous-vs-binary dispatch. + // Trait type detection runs against the first resolved column and + // applies to the entire joint multi-trait run. Multi-trait joint STAAR + // now supports all-binary traits via per-trait logistic IRLS + + // cross-trait residual correlation (see `multi::MultiNull::fit_binary`); + // we enforce that every trait in a multi-trait run shares the same + // detected family, since the joint null cannot mix gaussian and binomial. let primary_trait = &trait_cols[0]; let distinct_count = engine.query_scalar(&format!( "SELECT COUNT(DISTINCT \"{primary_trait}\") FROM _pheno" @@ -191,12 +193,25 @@ pub fn load_phenotype( staar::TraitType::Continuous }; out.status(&format!(" Trait '{primary_trait}' -> {:?}", trait_type)); - if k_traits > 1 && trait_type == staar::TraitType::Binary { - return Err(CohortError::Input(format!( - "Multi-trait joint STAAR requires continuous traits, but trait \ - '{primary_trait}' has only {distinct_count} distinct values. \ - Run each trait separately or drop the binary trait." - ))); + if k_traits > 1 { + for extra in &trait_cols[1..] { + let dc = engine.query_scalar(&format!( + "SELECT COUNT(DISTINCT \"{extra}\") FROM _pheno" + ))?; + let tt = if dc <= 2 { + staar::TraitType::Binary + } else { + staar::TraitType::Continuous + }; + if tt != trait_type { + return Err(CohortError::Input(format!( + "Multi-trait joint STAAR requires every --trait-name to \ + share the same family, but '{primary_trait}' is {:?} \ + and '{extra}' is {:?}. Run each family separately.", + trait_type, tt, + ))); + } + } } let id_col = resolve_id_column(&pheno_cols, column_map); @@ -911,13 +926,12 @@ mod tests { } } - /// Joint multi-trait STAAR is unrelated continuous only. 
If the first - /// resolved trait is detected as Binary (<= 2 distinct values) the - /// loader must reject with an Input error naming the offending column. + /// Joint multi-trait STAAR requires every --trait-name to share the + /// same family. Mixing a binary trait and a continuous trait in the + /// same run must reject with an Input error naming both columns. #[test] - fn load_phenotype_rejects_binary_trait_in_multi_mode() { + fn load_phenotype_rejects_mixed_family_in_multi_mode() { let dir = tempfile::tempdir().unwrap(); - // BMI here is coerced to {0.0, 1.0} so distinct_count == 2. let rows: Vec> = (1..=12) .map(|i| { let id = format!("s{i}"); @@ -964,18 +978,80 @@ mod tests { &SilentOut, ); match result { - Ok(_) => panic!("binary first trait + k>1 must be rejected"), + Ok(_) => panic!("mixed binary + continuous family must be rejected"), Err(CohortError::Input(msg)) => { - assert!(msg.contains("BMI"), "error must name the binary column: {msg}"); + assert!(msg.contains("BMI"), "error must name BMI: {msg}"); + assert!(msg.contains("HEIGHT"), "error must name HEIGHT: {msg}"); assert!( - msg.contains("continuous"), - "error must explain the continuous-only constraint: {msg}" + msg.contains("family"), + "error must explain the shared-family rule: {msg}" ); } Err(other) => panic!("expected Input error, got {other:?}"), } } + /// Multi-trait all-binary must now load successfully and produce a + /// `(n, k)` y matrix with binary-coded phenotypes. + #[test] + fn load_phenotype_accepts_all_binary_multi() { + let dir = tempfile::tempdir().unwrap(); + // Two binary traits: case1 and case2, each balanced 0/1. 
+ let rows: Vec> = (1..=12) + .map(|i| { + let id = format!("s{i}"); + let case1 = (i % 2) as f64; + let case2 = ((i + 1) % 2) as f64; + let age = 30.0 + i as f64; + let sex = (i % 2) as f64; + vec![ + id, + format!("{case1}"), + format!("{case2}"), + format!("{age}"), + format!("{sex}"), + ] + }) + .collect(); + let row_refs: Vec> = rows + .iter() + .map(|r| r.iter().map(|s| s.as_str()).collect()) + .collect(); + let pheno_path = write_pheno_tsv( + &dir, + &["IID", "case1", "case2", "age", "sex"], + &row_refs, + ); + let sample_ids: Vec = (1..=12).map(|i| format!("s{i}")).collect(); + let geno = fake_geno( + &sample_ids + .iter() + .map(|s| s.as_str()) + .collect::>(), + ); + + let pheno = load_phenotype( + &test_engine(), + &pheno_path, + &["age".into(), "sex".into()], + &geno, + &["case1".into(), "case2".into()], + None, + 5, + 42, + &HashMap::new(), + &SilentOut, + ) + .expect("all-binary multi must load"); + assert_eq!(pheno.trait_type, staar::TraitType::Binary); + assert_eq!(pheno.y.ncols(), 2); + assert_eq!(pheno.y.nrows(), 12); + for i in 0..12 { + assert!(pheno.y[(i, 0)] == 0.0 || pheno.y[(i, 0)] == 1.0); + assert!(pheno.y[(i, 1)] == 0.0 || pheno.y[(i, 1)] == 1.0); + } + } + /// Regression: the k=1 path must stay identical. A single continuous /// trait loads as a `(n, 1)` Y with the same values the old /// `&str`-taking loader would have produced. diff --git a/src/staar/multi.rs b/src/staar/multi.rs index f87bd1c..c0241f0 100644 --- a/src/staar/multi.rs +++ b/src/staar/multi.rs @@ -4,7 +4,7 @@ #![allow(clippy::needless_range_loop)] //! Joint multi-trait STAAR (Li et al. 2023, MultiSTAAR) for unrelated -//! continuous traits with shared covariates. +//! traits with shared covariates. //! //! Upstream `MultiSTAAR_O_SMMAT.cpp` materialises a stacked genotype //! `G_big = I_k ⊗ G₀` and a dense `(n·k × n·k)` projection `P` from GMMAT, @@ -21,6 +21,26 @@ //! `(m × k)` and `K₀` `(m × m)`. The joint chi-square tests this module //! 
produces are bit-equivalent to the upstream cpp on the unrelated //! continuous path; they reduce exactly to single-trait STAAR when k=1. +//! +//! **Binary extension.** GMMAT `glmmkin.R:43` rejects non-gaussian families +//! in the multi-pheno path, so MultiSTAAR has no upstream joint-binary null +//! fitter. Our `fit_binary` is a lift that combines two upstreams: +//! per-trait logistic IRLS (GMMAT single-trait binomial, mirrored in +//! `model::fit_logistic`) with MultiSTAAR's gaussian Kronecker pattern. +//! Concretely we fit each of k traits independently via single-trait +//! logistic IRLS, form R = Y − μ and take `Σ_res` as the cross-trait +//! residual **correlation** with unit diagonal (binomial dispersion is 1, +//! so using the raw covariance would shift k=1 off single-trait parity), +//! and use a shared working weight `W̄ = (1/k) Σ_j W_j` to build the +//! weighted projection `K₀ = G₀'(W̄ − W̄X(X'W̄X)⁻¹X'W̄)G₀`, i.e. the +//! single-trait Fisher-information kernel at the mean working weight. At +//! k=1 this reduces exactly to single-trait logistic: `Σ_res = [1]`, +//! `W̄ = W_1`, `K₀ = K_single_trait`. For k > 1 with unequal per-trait +//! weights this is a **shared-W approximation**: the joint covariance is +//! approximated by `Σ_res_corr ⊗ K̄` instead of the block-varying +//! `blockdiag(K_j)` a per-trait-weighted formulation would produce. This +//! preserves the Kronecker hot path and is the pragmatic first cut in the +//! absence of an upstream reference. use faer::prelude::*; use faer::Mat; @@ -28,16 +48,23 @@ use faer::Mat; use super::score::{beta_density_weight, StaarResult}; use super::stats; -/// Null model fit for unrelated continuous multi-trait STAAR. +/// Null model fit for unrelated multi-trait STAAR, continuous or binary. /// -/// Holds the per-trait residual matrix and the cross-trait residual -/// covariance, plus the shared covariate matrix and `(X'X)⁻¹` so we can -/// reconstruct `K₀` cheaply for any gene. 
-pub struct MultiNullContinuous {
-    /// `R = M Y`, shape `(n, k)`. Per-trait OLS residuals against the
-    /// shared covariate matrix.
+/// The gaussian path stores OLS residuals against the shared covariate
+/// matrix plus `(X'X)⁻¹`. The binary path stores per-trait IRLS residuals
+/// `Y − μ`, the mean working-weight vector, and `(X'W̄X)⁻¹`. Which path
+/// produced the fit is recorded in `working_weights`: `None` for gaussian,
+/// `Some(W̄)` for binary. Every hot-path function branches on this one
+/// field; all other fields keep the same Kronecker semantics.
+pub struct MultiNull {
+    /// Gaussian: `R = M Y`, shape `(n, k)`. Binary: `R = Y − μ` per trait,
+    /// raw score residuals (not working responses).
     pub residuals: Mat<f64>,
-    /// `Σ_res = R'R / (n − p)`, the unbiased residual covariance.
+    /// Gaussian: `Σ_res = R'R / (n − p)`, the unbiased residual covariance.
+    /// Binary: `Σ_res` is the cross-trait residual **correlation** with
+    /// unit diagonal. The unit diagonal keeps k=1 binary aligned with
+    /// single-trait logistic (binomial dispersion is 1), and preserves the
+    /// Kronecker structure `Σ_res ⊗ K̄` for the joint tests.
     pub sigma_res: Mat<f64>,
     /// `Σ_res⁻¹`, pre-computed for the hot path.
     pub sigma_inv: Mat<f64>,
@@ -46,13 +73,28 @@ pub struct MultiNullContinuous {
     pub sigma_inv_eigvals: Vec<f64>,
     /// Shared covariate matrix, `(n, p)`.
     pub x: Mat<f64>,
-    /// `(X'X)⁻¹`, `(p, p)`.
+    /// Gaussian: `(X'X)⁻¹`. Binary: `(X'W̄X)⁻¹` with `W̄ = (1/k) Σ_j W_j`.
+    /// Used by `projected_kernel_ols`/`projected_kernel_weighted` to build
+    /// the single-trait projected kernel `K₀` from any gene's `G₀`.
     pub xtx_inv: Mat<f64>,
+    /// Per-sample mean working weight `W̄_i = (1/k) Σ_j μ_{ij}(1 − μ_{ij})`
+    /// for binary traits, populated by `fit_binary`. `None` on the
+    /// gaussian path. When `Some`, `gene_stats` uses the Fisher-information
+    /// form `K₀ = G₀'(W̄ − W̄X(X'W̄X)⁻¹X'W̄)G₀` (`projected_kernel_weighted`);
+    /// when `None`, the OLS form `K₀ = G₀'(I − H)G₀` (`projected_kernel_ols`).
+ pub working_weights: Option>, + /// Converged multi-trait AI-REML state when kinship is in use + /// (`--kinship` on the CLI). `None` on both the unrelated gaussian + /// and unrelated binary paths. When `Some`, `run_multi_staar` + /// dispatches to the kinship-aware score kernel in + /// `crate::staar::multi_kinship`, which contracts against the full + /// `(mk × mk)` joint covariance. + pub kinship: Option, pub n_samples: usize, pub n_pheno: usize, } -impl MultiNullContinuous { +impl MultiNull { /// Fit the multivariate linear model `vec(Y) = (I_k ⊗ X) β + ε`, /// `ε ~ N(0, Σ_res ⊗ I_n)` for unrelated continuous traits with a /// shared covariate matrix `X`. @@ -87,10 +129,137 @@ impl MultiNullContinuous { sigma_inv_eigvals, x: x.clone(), xtx_inv, + working_weights: None, + kinship: None, + n_samples: n, + n_pheno: k, + } + } + + /// Fit k binary traits jointly via per-trait logistic IRLS + empirical + /// cross-trait residual correlation. See the module-level docstring + /// for the derivation. At k=1 this is bit-equivalent to + /// `model::fit_logistic`: `Σ_res = [1]`, `W̄ = W_1`, `xtx_inv = (X'W_1 X)⁻¹`. + pub fn fit_binary(y: &Mat, x: &Mat, max_iter: usize) -> Self { + let n = y.nrows(); + let k = y.ncols(); + let p = x.ncols(); + assert_eq!(x.nrows(), n, "X rows must match Y rows"); + assert!(n > p + k, "need n > p + k samples"); + + let mut residuals = Mat::::zeros(n, k); + let mut w_bar = vec![0.0_f64; n]; + let mut y_col = Mat::::zeros(n, 1); + for j in 0..k { + for i in 0..n { + y_col[(i, 0)] = y[(i, j)]; + } + let fit = super::model::fit_logistic(&y_col, x, max_iter); + for i in 0..n { + residuals[(i, j)] = fit.residuals[(i, 0)]; + } + let w_j = fit + .working_weights + .as_ref() + .expect("fit_logistic sets working_weights"); + for i in 0..n { + w_bar[i] += w_j[i]; + } + } + let inv_k = 1.0 / k as f64; + for i in 0..n { + w_bar[i] *= inv_k; + } + + // Cross-trait residual correlation with unit diagonal. 
Using the + // empirical covariance R'R/(n-p) directly would give `Σ_res[j,j] = + // mean(μ_j(1-μ_j))` for k=1 binary, breaking parity with + // single-trait logistic (which assumes dispersion = 1). The unit + // diagonal captures the fact that binomial dispersion is fixed; the + // off-diagonal correlation captures the cross-trait coupling. + let cov = (residuals.transpose() * &residuals) * (1.0 / (n - p) as f64); + let mut sigma_res = Mat::::zeros(k, k); + let mut d_inv = vec![0.0_f64; k]; + for j in 0..k { + let v = cov[(j, j)].max(1e-30); + d_inv[j] = 1.0 / v.sqrt(); + } + for a in 0..k { + for b in 0..k { + sigma_res[(a, b)] = cov[(a, b)] * d_inv[a] * d_inv[b]; + } + } + // Numerical guard: force exact unit diagonal so sigma_inv at k=1 + // is exactly 1.0. + for a in 0..k { + sigma_res[(a, a)] = 1.0; + } + + let eye_k = Mat::::identity(k, k); + let sigma_inv = sigma_res.col_piv_qr().solve(&eye_k); + let sigma_inv_eigvals = symmetric_eigenvalues(&sigma_inv); + + // `(X'W̄X)⁻¹` — the weighted normal-equation inverse at the mean + // working weight. Identical to single-trait `fit_logistic` when k=1. + let mut xtwx: Mat = Mat::zeros(p, p); + for i in 0..n { + let wi = w_bar[i].max(1e-30); + for a in 0..p { + for b in 0..p { + xtwx[(a, b)] += x[(i, a)] * wi * x[(i, b)]; + } + } + } + let eye_p = Mat::::identity(p, p); + let xtx_inv = xtwx.col_piv_qr().solve(&eye_p); + + Self { + residuals, + sigma_res, + sigma_inv, + sigma_inv_eigvals, + x: x.clone(), + xtx_inv, + working_weights: Some(w_bar), + kinship: None, n_samples: n, n_pheno: k, } } + + /// Fit the joint multi-trait GLMM with shared kinship matrices. + /// Wraps `multi_kinship::fit_multi_kinship` and packages the state + /// into a `MultiNull` so the pipeline's single dispatch point can + /// consume any of the three variants uniformly. The no-kinship + /// fields on `MultiNull` are populated with placeholder values in + /// this variant (the score path routes through `kinship` and never + /// reads them). 
+ pub fn fit_with_kinship( + y: &Mat, + x: &Mat, + kinships: &[crate::staar::kinship::types::KinshipMatrix], + groups: &crate::staar::kinship::types::GroupPartition, + ) -> Result { + let state = crate::staar::multi_kinship::fit_multi_kinship(y, x, kinships, groups)?; + let n = y.nrows(); + let k = y.ncols(); + Ok(Self { + // Placeholders for the kinship path. `run_multi_staar` + // branches on `kinship.is_some()` and never reads these + // fields; they exist to keep the struct layout uniform so + // callers that only want `n_samples`/`n_pheno` still work. + residuals: Mat::::zeros(n, k), + sigma_res: Mat::::identity(k, k), + sigma_inv: Mat::::identity(k, k), + sigma_inv_eigvals: vec![1.0; k], + x: x.clone(), + xtx_inv: Mat::::zeros(x.ncols(), x.ncols()), + working_weights: None, + kinship: Some(state), + n_samples: n, + n_pheno: k, + }) + } } /// Per-gene score statistics for joint multi-trait scoring. @@ -137,18 +306,21 @@ impl MultiScratch { } /// Build `(S, K₀)` for one gene. -pub fn gene_stats(g0: &Mat, null: &MultiNullContinuous) -> GeneStats { +pub fn gene_stats(g0: &Mat, null: &MultiNull) -> GeneStats { let n = g0.nrows(); let m = g0.ncols(); assert_eq!(n, null.n_samples, "G rows must match null samples"); let s = g0.transpose() * &null.residuals; - let k0 = projected_kernel(g0, &null.x, &null.xtx_inv, m); + let k0 = match null.working_weights.as_ref() { + None => projected_kernel_ols(g0, &null.x, &null.xtx_inv, m), + Some(w) => projected_kernel_weighted(g0, &null.x, &null.xtx_inv, w, m), + }; GeneStats { s, k0 } } /// `K₀ = G₀'(I − X(X'X)⁻¹X')G₀ = G₀' G₀ − (G₀' X)(X'X)⁻¹(X' G₀)`. 
-fn projected_kernel(g: &Mat, x: &Mat, xtx_inv: &Mat, m: usize) -> Mat { +fn projected_kernel_ols(g: &Mat, x: &Mat, xtx_inv: &Mat, m: usize) -> Mat { let gtx = g.transpose() * x; let inner = >x * xtx_inv; let correction = &inner * gtx.transpose(); @@ -161,6 +333,37 @@ fn projected_kernel(g: &Mat, x: &Mat, xtx_inv: &Mat, m: usize) -> k } +/// `K₀ = G₀'(W̄ − W̄X(X'W̄X)⁻¹X'W̄)G₀` for the binary path, using a shared +/// per-sample mean working weight `W̄`. Matches single-trait +/// `NullModel::compute_kernel` at k=1. +fn projected_kernel_weighted( + g: &Mat, + x: &Mat, + xtwx_inv: &Mat, + w: &[f64], + m: usize, +) -> Mat { + let n = g.nrows(); + debug_assert_eq!(w.len(), n); + let mut wg = Mat::::zeros(n, m); + for j in 0..m { + for i in 0..n { + wg[(i, j)] = w[i] * g[(i, j)]; + } + } + let gtwg = g.transpose() * &wg; + let xtwg = x.transpose() * &wg; + let inner = xtwx_inv * &xtwg; + let correction = wg.transpose() * (x * &inner); + let mut k = gtwg; + for i in 0..m { + for j in 0..m { + k[(i, j)] -= correction[(i, j)]; + } + } + k +} + /// Joint per-variant common test: /// /// q_i = (1 / k0[i,i]) · S[i,:] Σ_res⁻¹ S[i,:]ᵀ ~ χ²(k_pheno) @@ -353,7 +556,7 @@ pub fn multi_acat_v( /// so the existing writers and report layer keep working. 
pub fn run_multi_staar( g0: &Mat, - null: &MultiNullContinuous, + null: &MultiNull, annotation_matrix: &[Vec], mafs: &[f64], ) -> StaarResult { @@ -361,6 +564,14 @@ pub fn run_multi_staar( if m == 0 { return nan_result(); } + if let Some(ks) = null.kinship.as_ref() { + return crate::staar::multi_kinship::run_multi_staar_kinship( + g0, + ks, + annotation_matrix, + mafs, + ); + } let stats = gene_stats(g0, null); let mut scratch = MultiScratch::with_capacity(m, null.n_pheno); multi_tests(&stats, null, annotation_matrix, mafs, &mut scratch) @@ -368,7 +579,7 @@ pub fn run_multi_staar( fn multi_tests( stats: &GeneStats, - null: &MultiNullContinuous, + null: &MultiNull, annotation_matrix: &[Vec], mafs: &[f64], scratch: &mut MultiScratch, @@ -737,7 +948,7 @@ mod tests { let x = intercept_x(n); let single_null = model::fit_glm(&y_single, &x); - let multi_null = MultiNullContinuous::fit(&y_single, &x); + let multi_null = MultiNull::fit(&y_single, &x); assert_eq!(multi_null.n_pheno, 1); let single_p = score::run_staar(&g, &[], &mafs, &single_null, false); @@ -778,7 +989,7 @@ mod tests { .collect(); let single_null = model::fit_glm(&y, &x); - let multi_null = MultiNullContinuous::fit(&y, &x); + let multi_null = MultiNull::fit(&y, &x); let single_p = score::run_staar(&g, &ann, &mafs, &single_null, false); let multi_p = run_multi_staar(&g, &multi_null, &ann, &mafs); @@ -814,7 +1025,7 @@ mod tests { } let x = intercept_x(n); - let null = MultiNullContinuous::fit(&y, &x); + let null = MultiNull::fit(&y, &x); assert_eq!(null.n_pheno, 2); assert_eq!(null.sigma_res.nrows(), 2); assert_eq!(null.sigma_inv_eigvals.len(), 2); @@ -842,7 +1053,7 @@ mod tests { } let x = intercept_x(n); - let null = MultiNullContinuous::fit(&y, &x); + let null = MultiNull::fit(&y, &x); // Off-diagonal of Σ_res should be positive given the y2 construction. 
assert!(null.sigma_res[(0, 1)] > 0.0); @@ -855,4 +1066,111 @@ mod tests { assert!(result.acat_o.is_finite()); assert_eq!(result.per_annotation.len(), 2); } + + /// Deterministic Bernoulli draws from xorshift so the binary tests + /// stay reproducible without pulling in a heavy RNG dep. + fn random_binary(n: usize, p: f64, seed: u64) -> Mat { + let mut state = seed.max(1); + let mut next_u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut y = Mat::::zeros(n, 1); + for i in 0..n { + y[(i, 0)] = if next_u01() < p { 1.0 } else { 0.0 }; + } + y + } + + /// k=1 binary must reduce to single-trait logistic. `MultiNull::fit_binary` + /// with k=1 shares the same IRLS, residuals, and `(X'WX)⁻¹`, so the + /// per-test p-values should match to numerical tolerance (floating + /// point path differs slightly between direct and Kronecker-routed + /// kernels, but well below any analytically meaningful threshold). 
+ #[test] + fn k1_binary_matches_single_trait_logistic() { + let n = 300; + let m = 8; + let mafs = vec![0.01; m]; + let g = random_genotypes(n, m, &mafs, 53); + + let y = random_binary(n, 0.3, 59); + let x = intercept_x(n); + + let single_null = model::fit_logistic(&y, &x, 25); + let multi_null = MultiNull::fit_binary(&y, &x, 25); + assert_eq!(multi_null.n_pheno, 1); + assert!(multi_null.working_weights.is_some()); + assert!((multi_null.sigma_res[(0, 0)] - 1.0).abs() < 1e-12); + + let single_p = score::run_staar(&g, &[], &mafs, &single_null, false); + let multi_p = run_multi_staar(&g, &multi_null, &[], &mafs); + + let tol = 1e-8; + assert!( + (single_p.burden_1_25 - multi_p.burden_1_25).abs() < tol, + "burden(1,25) single={} multi={}", + single_p.burden_1_25, multi_p.burden_1_25, + ); + assert!((single_p.burden_1_1 - multi_p.burden_1_1).abs() < tol); + assert!((single_p.skat_1_25 - multi_p.skat_1_25).abs() < tol); + assert!((single_p.skat_1_1 - multi_p.skat_1_1).abs() < tol); + assert!((single_p.acat_v_1_25 - multi_p.acat_v_1_25).abs() < tol); + assert!((single_p.acat_v_1_1 - multi_p.acat_v_1_1).abs() < tol); + assert!((single_p.acat_o - multi_p.acat_o).abs() < tol); + assert!((single_p.staar_o - multi_p.staar_o).abs() < tol); + } + + /// k=2 binary omnibus runs and emits a valid p-value. Exercises the + /// shared-W̄ projected kernel and the residual-correlation coupling. + #[test] + fn k2_binary_omnibus_runs_and_is_valid_p() { + let n = 400; + let m = 7; + let mafs = vec![0.005, 0.008, 0.003, 0.006, 0.002, 0.004, 0.007]; + let g = random_genotypes(n, m, &mafs, 67); + + // Correlated binaries: y2 = y1 xor coin(0.3) to induce dependence. 
+ let y1 = random_binary(n, 0.35, 71); + let mut state: u64 = 73; + let mut next_u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut y = Mat::::zeros(n, 2); + for i in 0..n { + y[(i, 0)] = y1[(i, 0)]; + let flip = if next_u01() < 0.3 { 1.0 } else { 0.0 }; + let xor = if (y1[(i, 0)] == 1.0) ^ (flip == 1.0) { + 1.0 + } else { + 0.0 + }; + y[(i, 1)] = xor; + } + let x = intercept_x(n); + + let null = MultiNull::fit_binary(&y, &x, 25); + assert_eq!(null.n_pheno, 2); + assert!(null.working_weights.is_some()); + // Unit diagonal by construction; off-diagonal is the cross-trait + // residual correlation. We only assert finiteness here since the + // exact value is a function of the xorshift stream. + assert!((null.sigma_res[(0, 0)] - 1.0).abs() < 1e-12); + assert!((null.sigma_res[(1, 1)] - 1.0).abs() < 1e-12); + assert!(null.sigma_res[(0, 1)].is_finite()); + + let ann: Vec> = + (0..2).map(|_c| (0..m).map(|j| 0.5 + 0.1 * j as f64).collect()).collect(); + let result = run_multi_staar(&g, &null, &ann, &mafs); + + assert!(result.staar_o.is_finite()); + assert!((0.0..=1.0).contains(&result.staar_o)); + assert!(result.acat_o.is_finite()); + assert_eq!(result.per_annotation.len(), 2); + } } diff --git a/src/staar/multi_kinship.rs b/src/staar/multi_kinship.rs new file mode 100644 index 0000000..d4f374b --- /dev/null +++ b/src/staar/multi_kinship.rs @@ -0,0 +1,1405 @@ +// `needless_range_loop` is silenced because the expansion helpers and +// score-cov assembly read more naturally as index loops than iterator +// chains over `Mat` rows — matches `multi.rs`. +#![allow(clippy::needless_range_loop)] + +//! Multi-trait AI-REML with shared kinship matrices (related samples). +//! +//! Ports `glmmkin.multi.ai` from upstream GMMAT +//! (`hanchenphd/GMMAT@473b342 R/glmmkin.R:424-536`) for the `family = +//! gaussian` multi-pheno path that `fit_null_glmmkin_multi` wraps. The +//! 
upstream wrapper itself lives at +//! `xihaoli/MultiSTAAR@main R/fit_null_glmmkin_multi.R:70-123`. +//! +//! ## Variance-component parameterisation +//! +//! For `k` continuous traits with shared covariates `X` (shape `n × p`), +//! `L` kinship matrices `Φ_1..Φ_L` (shape `n × n`), and `G ≥ 1` residual +//! groups covering `{0..n}`, the joint covariance of `vec(Y)` (length +//! `nk`) under the null is +//! +//! `Σ = Σ_{g=1}^G Σ_res_g ⊗ D_g + Σ_{l=1}^L Σ_g_l ⊗ Φ_l` +//! +//! where `D_g` is the diagonal indicator of group `g` (so `Σ_g D_g = I_n` +//! in the homoscedastic case) and each `Σ_res_g`, `Σ_g_l` is a symmetric +//! `(k × k)` matrix of variance/covariance components. +//! +//! Upstream's trick (`glmmkin.R:147-166`) expands this into a flat list +//! of scalar variance components by walking every symmetric `(k × k)` +//! matrix entry: each `(j, m)` pair with `j ≤ m` contributes one +//! parameter `τ_{j,m}` and one expanded kinship entry `E_{j,m} ⊗ V`, +//! where `E_{j,m}` is the `(k × k)` symmetric matrix with a single `1` +//! at `(j, m)` and `(m, j)`. The AI-REML algorithm runs on this flat +//! scalar-tau list exactly like the single-trait path; at convergence, +//! upstream (`glmmkin.R:213-224`) reshapes the flat vector back into +//! per-group and per-kinship `(k × k)` symmetric matrices. +//! +//! Total parameter count: `n_params = k(k+1)/2 · (G + L)`. +//! +//! ## What this module adds +//! +//! * `expand_kinships` — builds the `n_params` expanded kinship matrices +//! `E_{j,m} ⊗ D_g` and `E_{j,m} ⊗ Φ_l`. +//! * `stack_response` — `vec(Y)` as an `(nk × 1)` column. +//! * `stack_covariates` — `I_k ⊗ X` as an `(nk × kp)` dense matrix +//! (block-diagonal with `k` copies of `X` on the diagonal). +//! * `MultiKinshipState` — converged fit: stacked `Σ⁻¹`, `Σ⁻¹X_stacked`, +//! `cov = (X_stacked' Σ⁻¹ X_stacked)⁻¹`, stacked `PY`, plus the +//! reshaped `(k × k)` per-group and per-kinship component matrices. +//! 
* `fit_multi_kinship` — runs AI-REML on the expanded list by reusing +//! `super::kinship::reml::run_reml` with `GroupPartition::empty`, +//! since every variance component now lives in the kinship list. +//! +//! ## Score-test path (gene_stats kinship branch) +//! +//! For a gene `G₀` (shape `n × m`), the stacked score is +//! +//! `U_stacked = (I_k ⊗ G₀)' · PY_stacked` shape `mk × 1` +//! +//! which unstacks to `S = G₀' · PY_reshaped` of shape `m × k`, the same +//! layout the no-kinship branch uses. The joint score covariance +//! *does not* factor as `Σ_res⁻¹ ⊗ K₀` — the kinship term breaks the +//! Kronecker structure of `Σ⁻¹`. Instead we compute the full +//! `(mk × mk)` covariance per gene (upstream +//! `MultiSTAAR/src/MultiSTAAR_O_SMMAT_sparse.cpp:77`): +//! +//! `Cov = (I_k ⊗ G₀)' Σ⁻¹ (I_k ⊗ G₀) − A' · cov · A` +//! +//! where `A = Σ⁻¹X_stacked' · (I_k ⊗ G₀)` has shape `kp × mk`. The per- +//! variant and per-test statistics then contract against this full +//! covariance instead of the Kronecker factor. +//! +//! ## k=1 parity +//! +//! At `k=1`, every `E_{0,0}` equals `[[1]]`, the expanded kinship list +//! is `{D_g}_g ∪ {Φ_l}_l` — exactly the single-trait kinship+groups +//! layout — and the stacked response/covariates collapse to the raw +//! `y`/`X`. `MultiKinshipState.sigma_inv`, `.sigma_inv_x`, `.cov`, +//! `.p_y` coincide with `KinshipState` fields bit-for-bit, and the +//! per-gene `(mk × mk)` cov reduces to the usual `(m × m)` `K₀`. The +//! unit test `k1_kinship_matches_single_trait_reml` pins this. + +use faer::prelude::Solve; +use faer::sparse::Triplet; +use faer::Mat; + +use super::kinship::reml::run_reml; +use super::kinship::types::{ + GroupPartition, KinshipInverse, KinshipMatrix, KinshipState, VarianceComponents, +}; +use crate::error::CohortError; + +/// Upper-triangle index enumeration for a `k × k` symmetric matrix: +/// returns the `(j, m)` pairs with `j ≤ m` in the same order upstream +/// GMMAT uses (`glmmkin.R:150-153`). 
For `k=2`: `[(0,0), (0,1), (1,1)]`. +fn sym_pairs(k: usize) -> Vec<(usize, usize)> { + let mut v = Vec::with_capacity(k * (k + 1) / 2); + for j in 0..k { + for m in j..k { + v.push((j, m)); + } + } + v +} + +/// Build the expanded kinship list for multi-trait AI-REML. +/// +/// Output order (matches `glmmkin.R:147-166`): first the residual-group +/// entries `E_{j,m} ⊗ D_g` in `(group, (j,m))` lexicographic order, then +/// the kinship entries `E_{j,m} ⊗ Φ_l` in `(kinship, (j,m))` order. +/// +/// Every output matrix is `(nk × nk)` sparse. The per-entry non-zero +/// count is `2 · nnz(V)` for `j < m` pairs (two Kronecker blocks: `(j,m)` +/// and `(m,j)`) and `nnz(V)` for `j == m` pairs (one diagonal block). +pub fn expand_kinships( + kinships: &[KinshipMatrix], + groups: &GroupPartition, + n_pheno: usize, +) -> Result, CohortError> { + let k = n_pheno; + let n = groups.n_samples(); + let pairs = sym_pairs(k); + let n_groups = groups.n_groups().max(1); + let n_entries = pairs.len() * (n_groups + kinships.len()); + let mut out: Vec = Vec::with_capacity(n_entries); + + // Residual-group block `E_{j,m} ⊗ D_g`. Homoscedastic case (no + // explicit groups) collapses to `E_{j,m} ⊗ I_n`. + let group_sets: Vec> = if groups.n_groups() == 0 { + vec![(0..n as u32).collect()] + } else { + (0..groups.n_groups()) + .map(|g| groups.group(g).to_vec()) + .collect() + }; + + for (gi, rows) in group_sets.iter().enumerate() { + for &(j, m) in &pairs { + let label = format!("res_g{gi}_{j}_{m}"); + let triplets = build_block_triplets(n, k, j, m, |i| { + // D_g at (i,i) = 1 if i is in group gi, 0 otherwise. + // We only emit diagonal triplets for in-group samples. + let _ = i; + Some(1.0) + }, rows); + out.push(KinshipMatrix::from_triplets(n * k, triplets, label)?); + } + } + + // Kinship block `E_{j,m} ⊗ Φ_l`. 
+ for (li, phi) in kinships.iter().enumerate() { + if phi.n() != n { + return Err(CohortError::Input(format!( + "kinship matrix '{}' has n={}, expected n={}", + phi.label(), + phi.n(), + n, + ))); + } + for &(j, m) in &pairs { + let label = format!("kins_{li}_{j}_{m}"); + let triplets = kronecker_block_triplets(n, k, j, m, phi); + out.push(KinshipMatrix::from_triplets(n * k, triplets, label)?); + } + } + + Ok(out) +} + +/// Build the `(nk × nk)` triplet list for `E_{j,m} ⊗ diag(d)` where +/// `d[i] = value(i)` for rows in `members`, else `0`. Used by the +/// residual-group branch. +fn build_block_triplets( + n: usize, + k: usize, + j: usize, + m: usize, + value: impl Fn(usize) -> Option, + members: &[u32], +) -> Vec> { + let _ = k; + let mut out: Vec> = + Vec::with_capacity(members.len() * if j == m { 1 } else { 2 }); + for &row in members { + let i = row as usize; + let v = match value(i) { + Some(v) if v != 0.0 => v, + _ => continue, + }; + // Block (j, m) sits at rows `j*n + i` and columns `m*n + i`. + out.push(Triplet::new( + (j * n + i) as u32, + (m * n + i) as u32, + v, + )); + if j != m { + out.push(Triplet::new( + (m * n + i) as u32, + (j * n + i) as u32, + v, + )); + } + } + out +} + +/// Build the `(nk × nk)` triplet list for `E_{j,m} ⊗ Φ`. Two Kronecker +/// blocks for `j != m` (at `(j,m)` and `(m,j)` in `E`), one for `j == m`. +fn kronecker_block_triplets( + n: usize, + k: usize, + j: usize, + m: usize, + phi: &KinshipMatrix, +) -> Vec> { + let _ = k; + let mut out: Vec> = Vec::new(); + let push_block = |out: &mut Vec>, jj: usize, mm: usize| { + match phi { + KinshipMatrix::Dense { matrix, .. } => { + for r in 0..n { + for c in 0..n { + let v = matrix[(r, c)]; + if v != 0.0 { + out.push(Triplet::new( + (jj * n + r) as u32, + (mm * n + c) as u32, + v, + )); + } + } + } + } + KinshipMatrix::Sparse { matrix, .. 
} => { + let col_ptr = matrix.symbolic().col_ptr(); + let row_idx = matrix.symbolic().row_idx(); + let val = matrix.val(); + for c in 0..n { + let s = col_ptr[c] as usize; + let e = col_ptr[c + 1] as usize; + for kk in s..e { + let r = row_idx[kk] as usize; + let v = val[kk]; + if v == 0.0 { + continue; + } + out.push(Triplet::new( + (jj * n + r) as u32, + (mm * n + c) as u32, + v, + )); + } + } + } + } + }; + push_block(&mut out, j, m); + if j != m { + push_block(&mut out, m, j); + } + out +} + +/// `vec(Y)` — column-major stack of an `(n × k)` phenotype matrix into +/// an `(nk × 1)` column. Trait 0 occupies rows `0..n`, trait 1 occupies +/// `n..2n`, and so on, matching `glmmkin.R:432` `Y2 <- as.vector(Y)`. +pub fn stack_response(y: &Mat) -> Mat { + let n = y.nrows(); + let k = y.ncols(); + let mut out = Mat::::zeros(n * k, 1); + for t in 0..k { + for i in 0..n { + out[(t * n + i, 0)] = y[(i, t)]; + } + } + out +} + +/// `I_k ⊗ X` — block-diagonal stacking of the shared covariate matrix +/// `X` `(n × p)` into `(nk × kp)`. Matches `glmmkin.R:434` +/// `X2 <- Diagonal(n=n.pheno) %x% X`. Dense storage is fine: `kp` is +/// small (≤ ~50 in practice) and the matrix is only formed once at fit +/// time. +pub fn stack_covariates(x: &Mat, n_pheno: usize) -> Mat { + let n = x.nrows(); + let p = x.ncols(); + let k = n_pheno; + let mut out = Mat::::zeros(n * k, k * p); + for t in 0..k { + for i in 0..n { + for c in 0..p { + out[(t * n + i, t * p + c)] = x[(i, c)]; + } + } + } + out +} + +/// Reshape a flat `Σ_{i=1}^{k(k+1)/2}` `τ` slice into a single symmetric +/// `(k × k)` matrix. Mirrors `glmmkin.R:216,222` which calls +/// `sparseMatrix(i=rep(1:k, k:1), j=flattened_cols, x=slice, symmetric=T)`. 
+fn reshape_sym_block(slice: &[f64], k: usize) -> Mat { + debug_assert_eq!(slice.len(), k * (k + 1) / 2); + let mut out = Mat::::zeros(k, k); + let mut idx = 0; + for j in 0..k { + for m in j..k { + out[(j, m)] = slice[idx]; + if j != m { + out[(m, j)] = slice[idx]; + } + idx += 1; + } + } + out +} + +/// Fitted state from multi-trait AI-REML with shared kinship matrices. +/// +/// Field layout mirrors the single-trait `KinshipState` but at the +/// stacked `(nk)` dimension. `theta_res` holds one `(k × k)` symmetric +/// residual-covariance matrix per residual group, `theta_g` holds one +/// `(k × k)` per kinship matrix. Several fields (`n_covariates`, +/// `n_iter`, and the reshaped `theta_res`/`theta_g` components) are +/// carried for downstream consumers and run-manifest reporting even if +/// the hot path does not read them; the `allow(dead_code)` keeps clippy +/// quiet without losing the public API. +#[derive(Clone)] +#[allow(dead_code)] +pub struct MultiKinshipState { + /// Stacked `Σ⁻¹` of shape `(nk × nk)`. + pub sigma_inv: KinshipInverse, + /// `Σ⁻¹ · (I_k ⊗ X)` of shape `(nk × kp)`. + pub sigma_inv_x: Mat, + /// `(X_stacked' · Σ⁻¹ · X_stacked)⁻¹` of shape `(kp × kp)`. + pub cov: Mat, + /// `PY_stacked = Σ⁻¹ · (vec(Y) − X_stacked · α̂)` of shape `(nk × 1)`. + pub p_y: Mat, + /// Per-group residual covariance matrices `Σ_res_g`, each `(k × k)`. + /// One entry even in the homoscedastic case. + pub theta_res: Vec>, + /// Per-kinship covariance matrices `Σ_g_l`, each `(k × k)`. One per + /// input kinship matrix, in input order. + pub theta_g: Vec>, + pub n_samples: usize, + pub n_pheno: usize, + pub n_covariates: usize, + pub n_iter: usize, +} + +/// Warm-start initialiser for the expanded `τ` vector. Mirrors +/// `glmmkin.R:446-447`: diagonal entries (`j == m`) get `var(vec(Y))/q` +/// where `q` is the expanded list length, off-diagonals start at `0`. 
+fn warm_start_tau(y_stacked: &Mat, n_pairs: usize, n_entries: usize, k: usize) -> VarianceComponents { + let n = y_stacked.nrows(); + let mean: f64 = (0..n).map(|i| y_stacked[(i, 0)]).sum::() / n as f64; + let var: f64 = (0..n) + .map(|i| (y_stacked[(i, 0)] - mean).powi(2)) + .sum::() + / (n - 1).max(1) as f64; + let warm = (var / n_entries as f64).max(1e-6); + let mut tau = VarianceComponents::zeros(n_entries, 0); + let pairs = sym_pairs(k); + let slot = tau.as_slice_mut(); + for block in 0..(n_entries / n_pairs) { + for (pair_idx, &(j, m)) in pairs.iter().enumerate() { + let flat = block * n_pairs + pair_idx; + slot[flat] = if j == m { warm } else { 0.0 }; + } + } + tau +} + +/// Multi-trait AI-REML with shared kinship matrices. Returns the +/// converged multi-trait null state. +/// +/// upstream: +/// `MultiSTAAR/R/fit_null_glmmkin_multi.R:70-123` → wrapper +/// `GMMAT/R/glmmkin.R:424-536` → `glmmkin.multi.ai` body +/// `GMMAT/R/glmmkin.R:771-811` → `R_fitglmm_ai_noresidual` AI iteration +pub fn fit_multi_kinship( + y: &Mat, + x: &Mat, + kinships: &[KinshipMatrix], + groups: &GroupPartition, +) -> Result { + let n = y.nrows(); + let k = y.ncols(); + let p = x.ncols(); + assert_eq!(x.nrows(), n, "X rows must match Y rows"); + assert!(n * k > p * k + 1, "need nk > kp samples for the joint fit"); + if !kinships.is_empty() && kinships[0].n() != n { + return Err(CohortError::Input(format!( + "kinship dim {} does not match phenotype dim {}", + kinships[0].n(), + n, + ))); + } + + let pairs = sym_pairs(k); + let n_pairs = pairs.len(); + let n_groups = groups.n_groups().max(1); + let n_entries = n_pairs * (n_groups + kinships.len()); + + let y_stacked = stack_response(y); + let x_stacked = stack_covariates(x, k); + let expanded = expand_kinships(kinships, groups, k)?; + debug_assert_eq!(expanded.len(), n_entries); + + let init_tau = warm_start_tau(&y_stacked, n_pairs, n_entries, k); + + // `run_reml` was written for the single-trait path and expects a + // 
backend-specific builder. For the dense-first multi-trait fit we + // route through the dense builder; sparse kinship → sparse stacked + // path is a follow-up (the expanded entries are naturally sparse for + // sparse Φ, but faer's sparse Cholesky on nk × nk needs more care + // around symbolic-factor reuse across iterations). + // + // For now we materialise the expanded entries as dense `KinshipMatrix` + // if any were originally dense, and keep the originally-sparse ones + // in sparse storage. The shared `ai_step` handles both variants via + // `matvec_kinship`, and `DenseBuilder` promotes sparse kins to dense + // Σ⁻¹ inside each iteration. + let expanded_kins_dense = promote_to_dense(&expanded, n * k); + let empty_groups = GroupPartition::empty(n * k); + let weights = vec![1.0; n * k]; + let builder = super::kinship::dense::DenseBuilder { + n: n * k, + kinships: &expanded_kins_dense, + groups: &empty_groups, + weights: &weights, + }; + let state: KinshipState = run_reml( + &y_stacked, + &x_stacked, + &expanded_kins_dense, + &empty_groups, + &weights, + init_tau, + &builder, + )?; + + // Reshape τ back to per-group / per-kinship (k × k) matrices. + let mut theta_res: Vec> = Vec::with_capacity(n_groups); + let mut theta_g: Vec> = Vec::with_capacity(kinships.len()); + let tau_flat = state.tau.as_slice(); + for gi in 0..n_groups { + let offset = gi * n_pairs; + theta_res.push(reshape_sym_block(&tau_flat[offset..offset + n_pairs], k)); + } + for li in 0..kinships.len() { + let offset = (n_groups + li) * n_pairs; + theta_g.push(reshape_sym_block(&tau_flat[offset..offset + n_pairs], k)); + } + + Ok(MultiKinshipState { + sigma_inv: state.inverse, + sigma_inv_x: state.sigma_inv_x, + cov: state.cov, + p_y: state.p_y, + theta_res, + theta_g, + n_samples: n, + n_pheno: k, + n_covariates: p, + n_iter: state.n_iter, + }) +} + +/// Per-gene score statistics for the multi-trait kinship path. 
+///
+/// `s[j, t] = (G₀' · PY_reshaped)[j, t]` is the per-trait score at
+/// variant `j`, same shape and semantics as the no-kinship
+/// `GeneStats.s`. `cov_full` is the full `(mk × mk)` joint score
+/// covariance — the kinship term prevents the Kronecker factorisation
+/// the no-kinship path relies on, so the per-test kernels contract
+/// against this matrix directly.
+///
+/// Layout of `cov_full`: trait-major block order. The entry at
+/// `(t1 * m + j1, t2 * m + j2)` is the covariance between the score at
+/// trait `t1`, variant `j1` and trait `t2`, variant `j2`. In the
+/// no-kinship limit (`Σ = Σ_res ⊗ I_n`) this matrix factorises exactly
+/// as `Σ_res⁻¹ ⊗ K₀` — the same Kronecker form the no-kinship path
+/// relies on — which is the invariant the k=1 gene-level parity test
+/// pins.
+pub struct MultiKinshipGeneStats {
+    pub s: Mat<f64>,
+    pub cov_full: Mat<f64>,
+}
+
+/// Build the per-gene `(S, cov_full)` for the multi-trait kinship path.
+/// See `MultiKinshipGeneStats` for layout.
+///
+/// upstream: `MultiSTAAR/src/MultiSTAAR_O_SMMAT_sparse.cpp:77` for the
+/// sparse-Σ assembly (we use the dense-Σ analogue — same formula).
+pub fn gene_stats_kinship(g0: &Mat<f64>, null: &MultiKinshipState) -> MultiKinshipGeneStats {
+    let n = null.n_samples;
+    let k = null.n_pheno;
+    let m = g0.ncols();
+    assert_eq!(g0.nrows(), n, "G₀ rows must match null n_samples");
+    let mk = m * k;
+
+    // S[j, t] = Σ_i G₀[i, j] · PY[t*n + i, 0].
+    let mut s = Mat::<f64>::zeros(m, k);
+    for t in 0..k {
+        for j in 0..m {
+            let mut acc = 0.0;
+            for i in 0..n {
+                acc += g0[(i, j)] * null.p_y[(t * n + i, 0)];
+            }
+            s[(j, t)] = acc;
+        }
+    }
+
+    // Materialise G_stacked = I_k ⊗ G₀ as (nk × mk). For small gene-set
+    // sizes this is cheap; a block-wise implementation that avoids the
+    // allocation is a follow-up if production cohorts hit it.
+ let mut g_stacked = Mat::::zeros(n * k, mk); + for t in 0..k { + for i in 0..n { + for j in 0..m { + g_stacked[(t * n + i, t * m + j)] = g0[(i, j)]; + } + } + } + + // Σ⁻¹ · G_stacked — dense solve or matmul depending on the stored + // inverse representation. + let sinv_g = match &null.sigma_inv { + KinshipInverse::Dense(sinv) => sinv * &g_stacked, + KinshipInverse::Sparse(factor) => { + use faer::linalg::solvers::Solve; + let mut out = g_stacked.clone(); + factor.llt.solve_in_place(&mut out); + out + } + }; + + let term1 = g_stacked.transpose() * &sinv_g; + let xs_sinv_g = null.sigma_inv_x.transpose() * &g_stacked; + let cov_correction = xs_sinv_g.transpose() * (&null.cov * &xs_sinv_g); + let mut cov_full = term1; + for r in 0..mk { + for c in 0..mk { + cov_full[(r, c)] -= cov_correction[(r, c)]; + } + } + + MultiKinshipGeneStats { s, cov_full } +} + +/// Joint per-variant χ²(k) test at variant `i` using the full `(mk × +/// mk)` covariance. Extracts the `(k × k)` diagonal block at +/// `(t1 * m + i, t2 * m + i)`, then evaluates +/// `Q_i = U_i' · Cov_i⁻¹ · U_i ~ χ²(k)` with `U_i = S[i, :]`. +/// +/// Returns `1.0` if the per-variant variance block is numerically +/// singular or non-positive-definite; matches the fail-safe semantics +/// of `multi::variant_joint_chi2`. +pub fn variant_joint_chi2_kinship( + s: &Mat, + cov_full: &Mat, + i: usize, + n_pheno: usize, + m: usize, +) -> f64 { + let k = n_pheno; + // Extract (k × k) diagonal block at variant i. + let mut block = Mat::::zeros(k, k); + for t1 in 0..k { + for t2 in 0..k { + block[(t1, t2)] = cov_full[(t1 * m + i, t2 * m + i)]; + } + } + // Diagonal-zero guard: if any trait's variance collapses the joint + // test is undefined; return 1.0 to mirror `multi::variant_joint_chi2`. 
+ for a in 0..k { + if !block[(a, a)].is_finite() || block[(a, a)] <= 0.0 { + return 1.0; + } + } + let eye_k = Mat::::identity(k, k); + let block_inv: Mat = block.col_piv_qr().solve(&eye_k); + let mut quad = 0.0; + for a in 0..k { + for b in 0..k { + quad += s[(i, a)] * block_inv[(a, b)] * s[(i, b)]; + } + } + if !quad.is_finite() || quad <= 0.0 { + return 1.0; + } + chisq_pvalue(quad, k as f64) +} + +/// Joint burden test with weight `w`. Per-trait burden score +/// `U_burden[t] = Σ_j w[j] · S[j, t]`, variance block +/// `V[t1, t2] = Σ_{j1, j2} w[j1] · cov_full[t1m+j1, t2m+j2] · w[j2]`, +/// then `Q = U_burden' V⁻¹ U_burden ~ χ²(k)`. +pub fn multi_burden_kinship( + s: &Mat, + cov_full: &Mat, + w: &[f64], + n_pheno: usize, +) -> f64 { + let k = n_pheno; + let m = w.len(); + if m == 0 { + return 1.0; + } + let mut u_burden = vec![0.0_f64; k]; + for t in 0..k { + let mut acc = 0.0; + for j in 0..m { + acc += w[j] * s[(j, t)]; + } + u_burden[t] = acc; + } + + // V[t1, t2] = Σ_{j1, j2} w[j1] · cov_full[t1m+j1, t2m+j2] · w[j2]. + let mut v = Mat::::zeros(k, k); + for t1 in 0..k { + for t2 in 0..k { + let mut acc = 0.0; + for j1 in 0..m { + if w[j1] == 0.0 { + continue; + } + for j2 in 0..m { + if w[j2] == 0.0 { + continue; + } + acc += w[j1] * cov_full[(t1 * m + j1, t2 * m + j2)] * w[j2]; + } + } + v[(t1, t2)] = acc; + } + } + + let eye_k = Mat::::identity(k, k); + let v_inv = v.col_piv_qr().solve(&eye_k); + let mut q = 0.0; + for a in 0..k { + for b in 0..k { + q += u_burden[a] * v_inv[(a, b)] * u_burden[b]; + } + } + if !q.is_finite() || q <= 0.0 { + return 1.0; + } + chisq_pvalue(q, k as f64) +} + +/// Joint SKAT with weight `w`. +/// +/// `Q = vec(WS)' (I_k ⊗ W) vec(S) = Σ_{t, j} w[j]² · S[j, t]²` +/// +/// Under the null `vec(S) ~ N(0, cov_full)`, so `Q` is a weighted +/// mixture of χ²(1) with eigenvalues of +/// `W^(1/2)_stacked · cov_full · W^(1/2)_stacked` where +/// `W_stacked = I_k ⊗ diag(w²)`. 
At `k=1` the construction reduces to +/// `diag(w²) · K · diag(w²) → eigvals = eigvals(W K W)`, which matches +/// single-trait SKAT. +pub fn multi_skat_kinship( + s: &Mat, + cov_full: &Mat, + w: &[f64], + n_pheno: usize, +) -> f64 { + let k = n_pheno; + let m = w.len(); + if m == 0 { + return 1.0; + } + // Q = Σ_{t, j} w[j]² · S[j, t]². + let mut q = 0.0; + for t in 0..k { + for j in 0..m { + let wj = w[j]; + if wj == 0.0 { + continue; + } + let v = wj * s[(j, t)]; + q += v * v; + } + } + if !q.is_finite() || q <= 0.0 { + return 1.0; + } + + // Weighted kernel `A = W_stacked · cov_full · W_stacked` with + // `W_stacked = I_k ⊗ diag(w)`. (Using w rather than √(w²) keeps + // signs consistent with the quadratic form above; eigenvalues of + // A are the mixture weights for `Q`.) + let mk = m * k; + let mut a = Mat::::zeros(mk, mk); + for t1 in 0..k { + for j1 in 0..m { + let wi = w[j1]; + if wi == 0.0 { + continue; + } + for t2 in 0..k { + for j2 in 0..m { + let wj = w[j2]; + if wj == 0.0 { + continue; + } + a[(t1 * m + j1, t2 * m + j2)] = + wi * cov_full[(t1 * m + j1, t2 * m + j2)] * wj; + } + } + } + } + + let eigs = symmetric_eigenvalues(&a); + crate::staar::stats::mixture_chisq_pvalue(q, &eigs) +} + +fn symmetric_eigenvalues(m: &Mat) -> Vec { + let n = m.nrows(); + if n == 0 { + return Vec::new(); + } + match m.self_adjoint_eigen(faer::Side::Lower) { + Ok(evd) => { + let s = evd.S(); + let col = s.column_vector(); + (0..n).map(|i| col[i].max(0.0)).collect() + } + Err(_) => vec![0.0; n], + } +} + +/// Joint ACAT-V: per-variant common-variant joint χ²(k), plus a pooled +/// rare-group joint burden; Cauchy-combined with `w_acat` weights. +/// Matches the no-kinship `multi::multi_acat_v` MAC split (MAC > 10 ⇒ +/// common; MAC ≤ 10 ⇒ pooled burden with `w_burden` weights). 
+#[allow(clippy::too_many_arguments)] +pub fn multi_acat_v_kinship( + s: &Mat, + cov_full: &Mat, + n_pheno: usize, + w_acat: &[f64], + w_burden: &[f64], + mafs: &[f64], + n_samples: usize, +) -> f64 { + const MAC_THRESHOLD: f64 = 10.0; + let k = n_pheno; + let m = w_acat.len(); + let ns = n_samples as f64; + + let mut p_values: Vec = Vec::with_capacity(m); + let mut cauchy_weights: Vec = Vec::with_capacity(m); + let mut rare_idx: Vec = Vec::new(); + + for j in 0..m { + if w_acat[j] == 0.0 { + continue; + } + let mac = (2.0 * mafs[j] * ns).round(); + if mac > MAC_THRESHOLD { + let p = variant_joint_chi2_kinship(s, cov_full, j, k, m); + p_values.push(p); + cauchy_weights.push(w_acat[j]); + } else { + rare_idx.push(j); + } + } + + if !rare_idx.is_empty() { + let mut w_rare = vec![0.0_f64; m]; + for &j in &rare_idx { + w_rare[j] = w_burden[j]; + } + let p = multi_burden_kinship(s, cov_full, &w_rare, k); + let mean_w = + rare_idx.iter().map(|&j| w_acat[j]).sum::() / rare_idx.len() as f64; + p_values.push(p); + cauchy_weights.push(mean_w); + } + + if p_values.is_empty() { + return 1.0; + } + crate::staar::stats::cauchy_combine_weighted(&p_values, &cauchy_weights) +} + +fn chisq_pvalue(t: f64, df: f64) -> f64 { + use statrs::distribution::{ChiSquared, ContinuousCDF}; + if t <= 0.0 || !t.is_finite() { + return 1.0; + } + match ChiSquared::new(df) { + Ok(d) => (1.0 - d.cdf(t)).max(crate::staar::stats::P_FLOOR), + Err(_) => f64::NAN, + } +} + +/// Run the multi-trait joint STAAR omnibus on one gene against a +/// kinship-aware null. Mirrors `multi::run_multi_staar` test-for-test +/// but contracts against the full `(mk × mk)` covariance so the results +/// stay valid under arbitrary kinship correlation. 
+pub fn run_multi_staar_kinship( + g0: &Mat, + null: &MultiKinshipState, + annotation_matrix: &[Vec], + mafs: &[f64], +) -> crate::staar::score::StaarResult { + use crate::staar::score::{beta_density_weight, StaarResult}; + use crate::staar::stats; + let m = g0.ncols(); + if m == 0 { + return nan_result(); + } + let gs = gene_stats_kinship(g0, null); + let s = &gs.s; + let cov_full = &gs.cov_full; + let k = null.n_pheno; + let n_samples = null.n_samples; + + let beta_1_25: Vec = + mafs.iter().map(|&maf| beta_density_weight(maf, 1.0, 25.0)).collect(); + let beta_1_1: Vec = + mafs.iter().map(|&maf| beta_density_weight(maf, 1.0, 1.0)).collect(); + let acat_denom: Vec = mafs + .iter() + .map(|&maf| { + let d = beta_density_weight(maf, 0.5, 0.5); + if d > 0.0 { d * d } else { 1.0 } + }) + .collect(); + + let base_burden_1_25 = multi_burden_kinship(s, cov_full, &beta_1_25, k); + let base_burden_1_1 = multi_burden_kinship(s, cov_full, &beta_1_1, k); + let base_skat_1_25 = multi_skat_kinship(s, cov_full, &beta_1_25, k); + let base_skat_1_1 = multi_skat_kinship(s, cov_full, &beta_1_1, k); + let wa_base_1_25: Vec = + beta_1_25.iter().zip(&acat_denom).map(|(b, d)| b * b / d).collect(); + let wa_base_1_1: Vec = + beta_1_1.iter().zip(&acat_denom).map(|(b, d)| b * b / d).collect(); + let base_acat_v_1_25 = multi_acat_v_kinship( + s, cov_full, k, &wa_base_1_25, &beta_1_25, mafs, n_samples, + ); + let base_acat_v_1_1 = multi_acat_v_kinship( + s, cov_full, k, &wa_base_1_1, &beta_1_1, mafs, n_samples, + ); + + let acat_o = stats::cauchy_combine(&[ + base_burden_1_25, + base_burden_1_1, + base_skat_1_25, + base_skat_1_1, + base_acat_v_1_25, + base_acat_v_1_1, + ]); + + let n_channels = annotation_matrix.len(); + let mut per_annotation: Vec<[f64; 6]> = Vec::with_capacity(n_channels); + let mut by_test: [Vec; 6] = [ + vec![base_burden_1_25], + vec![base_burden_1_1], + vec![base_skat_1_25], + vec![base_skat_1_1], + vec![base_acat_v_1_25], + vec![base_acat_v_1_1], + ]; + + let mut 
wb_1_25 = vec![0.0; m]; + let mut wb_1_1 = vec![0.0; m]; + let mut ws_1_25 = vec![0.0; m]; + let mut ws_1_1 = vec![0.0; m]; + let mut wa_1_25 = vec![0.0; m]; + let mut wa_1_1 = vec![0.0; m]; + + for channel_weights in annotation_matrix { + for j in 0..m { + let a = channel_weights[j]; + let a_sqrt = a.sqrt(); + wb_1_25[j] = beta_1_25[j] * a; + wb_1_1[j] = beta_1_1[j] * a; + ws_1_25[j] = beta_1_25[j] * a_sqrt; + ws_1_1[j] = beta_1_1[j] * a_sqrt; + wa_1_25[j] = a * beta_1_25[j] * beta_1_25[j] / acat_denom[j]; + wa_1_1[j] = a * beta_1_1[j] * beta_1_1[j] / acat_denom[j]; + } + let p = [ + multi_burden_kinship(s, cov_full, &wb_1_25, k), + multi_burden_kinship(s, cov_full, &wb_1_1, k), + multi_skat_kinship(s, cov_full, &ws_1_25, k), + multi_skat_kinship(s, cov_full, &ws_1_1, k), + multi_acat_v_kinship(s, cov_full, k, &wa_1_25, &wb_1_25, mafs, n_samples), + multi_acat_v_kinship(s, cov_full, k, &wa_1_1, &wb_1_1, mafs, n_samples), + ]; + for i in 0..6 { + by_test[i].push(p[i]); + } + per_annotation.push(p); + } + + let staar_b_1_25 = stats::cauchy_combine(&by_test[0]); + let staar_b_1_1 = stats::cauchy_combine(&by_test[1]); + let staar_s_1_25 = stats::cauchy_combine(&by_test[2]); + let staar_s_1_1 = stats::cauchy_combine(&by_test[3]); + let staar_a_1_25 = stats::cauchy_combine(&by_test[4]); + let staar_a_1_1 = stats::cauchy_combine(&by_test[5]); + + let mut all_p: Vec = Vec::with_capacity(6 + n_channels * 6); + all_p.extend_from_slice(&[ + base_burden_1_25, + base_burden_1_1, + base_skat_1_25, + base_skat_1_1, + base_acat_v_1_25, + base_acat_v_1_1, + ]); + for p in &per_annotation { + all_p.extend_from_slice(p); + } + let staar_o = stats::cauchy_combine(&all_p); + + StaarResult { + burden_1_25: base_burden_1_25, + burden_1_1: base_burden_1_1, + skat_1_25: base_skat_1_25, + skat_1_1: base_skat_1_1, + acat_v_1_25: base_acat_v_1_25, + acat_v_1_1: base_acat_v_1_1, + per_annotation, + staar_b_1_25, + staar_b_1_1, + staar_s_1_25, + staar_s_1_1, + staar_a_1_25, + staar_a_1_1, + 
acat_o, + staar_o, + } +} + +fn nan_result() -> crate::staar::score::StaarResult { + crate::staar::score::StaarResult { + burden_1_25: f64::NAN, + burden_1_1: f64::NAN, + skat_1_25: f64::NAN, + skat_1_1: f64::NAN, + acat_v_1_25: f64::NAN, + acat_v_1_1: f64::NAN, + per_annotation: Vec::new(), + staar_b_1_25: f64::NAN, + staar_b_1_1: f64::NAN, + staar_s_1_25: f64::NAN, + staar_s_1_1: f64::NAN, + staar_a_1_25: f64::NAN, + staar_a_1_1: f64::NAN, + acat_o: f64::NAN, + staar_o: f64::NAN, + } +} + +/// Promote every sparse entry in an expanded list to a dense +/// `KinshipMatrix` so the dense REML builder can operate uniformly. For +/// the dense-first Phase B this is the simplest correct path; the sparse +/// Σ route follows once we validate the math end-to-end. +/// +/// The expanded entries are extremely sparse (single-block Kronecker), +/// so `KinshipMatrix::new` would route them back to sparse storage via +/// its density threshold. We bypass that by constructing the `Dense` +/// variant directly — the dense backend requires dense storage +/// regardless of density. +fn promote_to_dense(expanded: &[KinshipMatrix], n: usize) -> Vec { + let mut out = Vec::with_capacity(expanded.len()); + for kin in expanded { + match kin { + KinshipMatrix::Dense { .. 
} => out.push(kin.clone()), + KinshipMatrix::Sparse { matrix, label } => { + let mut dense = Mat::::zeros(n, n); + let col_ptr = matrix.symbolic().col_ptr(); + let row_idx = matrix.symbolic().row_idx(); + let val = matrix.val(); + for c in 0..n { + let s = col_ptr[c] as usize; + let e = col_ptr[c + 1] as usize; + for kk in s..e { + let r = row_idx[kk] as usize; + dense[(r, c)] += val[kk]; + } + } + out.push(KinshipMatrix::Dense { + matrix: dense, + label: label.clone(), + }); + } + } + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + fn identity_kinship(n: usize, label: &str) -> KinshipMatrix { + let mut m = Mat::::zeros(n, n); + for i in 0..n { + m[(i, i)] = 1.0; + } + KinshipMatrix::new(m, label.into()).unwrap() + } + + fn pedigree_kinship(n_fam: usize, label: &str) -> KinshipMatrix { + // 2-sib families: 0.5 within-family off-diagonal, 1.0 diagonal. + let n = n_fam * 2; + let mut m = Mat::::zeros(n, n); + for f in 0..n_fam { + let a = 2 * f; + let b = 2 * f + 1; + m[(a, a)] = 1.0; + m[(b, b)] = 1.0; + m[(a, b)] = 0.5; + m[(b, a)] = 0.5; + } + KinshipMatrix::new(m, label.into()).unwrap() + } + + #[test] + fn sym_pairs_enumerate_upper_triangle() { + assert_eq!(sym_pairs(1), vec![(0, 0)]); + assert_eq!(sym_pairs(2), vec![(0, 0), (0, 1), (1, 1)]); + assert_eq!(sym_pairs(3), vec![(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]); + } + + #[test] + fn stack_response_is_column_major() { + let n = 3; + let k = 2; + let mut y = Mat::::zeros(n, k); + for i in 0..n { + for t in 0..k { + y[(i, t)] = (10 * t + i) as f64; + } + } + let y2 = stack_response(&y); + assert_eq!(y2.nrows(), n * k); + assert_eq!(y2.ncols(), 1); + assert_eq!(y2[(0, 0)], 0.0); + assert_eq!(y2[(1, 0)], 1.0); + assert_eq!(y2[(2, 0)], 2.0); + assert_eq!(y2[(3, 0)], 10.0); + assert_eq!(y2[(4, 0)], 11.0); + assert_eq!(y2[(5, 0)], 12.0); + } + + #[test] + fn stack_covariates_is_block_diagonal() { + let n = 2; + let p = 2; + let k = 3; + let mut x = Mat::::zeros(n, p); + x[(0, 0)] = 1.0; + x[(0, 
1)] = 2.0; + x[(1, 0)] = 3.0; + x[(1, 1)] = 4.0; + let x2 = stack_covariates(&x, k); + assert_eq!(x2.nrows(), n * k); + assert_eq!(x2.ncols(), k * p); + + // Block (0, 0): rows 0..n, cols 0..p carries original X. + assert_eq!(x2[(0, 0)], 1.0); + assert_eq!(x2[(1, 1)], 4.0); + // Block (1, 1): rows n..2n, cols p..2p carries original X. + assert_eq!(x2[(2, 2)], 1.0); + assert_eq!(x2[(3, 3)], 4.0); + // Off-diagonal blocks are zero. + assert_eq!(x2[(0, 2)], 0.0); + assert_eq!(x2[(2, 0)], 0.0); + } + + #[test] + fn expand_kinships_entry_count_matches_upstream_formula() { + let n = 6; + let k = 2; + let phi = identity_kinship(n, "phi"); + let groups = GroupPartition::single(n); + let expanded = expand_kinships(std::slice::from_ref(&phi), &groups, k).unwrap(); + // n_params = k(k+1)/2 · (G + L) = 3 · (1 + 1) = 6 + assert_eq!(expanded.len(), 6); + for e in &expanded { + assert_eq!(e.n(), n * k); + } + } + + #[test] + fn reshape_sym_block_is_symmetric_and_orders_upper_triangle() { + let flat = [1.0, 2.0, 3.0]; // (0,0), (0,1), (1,1) + let m = reshape_sym_block(&flat, 2); + assert_eq!(m[(0, 0)], 1.0); + assert_eq!(m[(0, 1)], 2.0); + assert_eq!(m[(1, 0)], 2.0); + assert_eq!(m[(1, 1)], 3.0); + } + + /// k=1 parity: `fit_multi_kinship` with one trait must produce the + /// same Σ⁻¹, Σ⁻¹X, cov, and PY as single-trait `fit_reml_dense` on + /// the same inputs. At k=1 the expansion collapses to `E_{0,0} ⊗ V = + /// V`, stacking is the identity, and the AI-REML runs on identical + /// inputs; the fits should converge to the same estimates. + #[test] + fn k1_kinship_matches_single_trait_reml() { + use crate::staar::kinship::fit_reml; + + let n_fam = 6; + let n = n_fam * 2; + // Use a dense kinship so both single-trait and multi paths route + // through the dense backend — the k=1 parity assertion is about + // the math, not the dense/sparse dispatch. 
+ let mut phi_dense = Mat::::zeros(n, n); + for i in 0..n { + phi_dense[(i, i)] = 1.0; + } + for f in 0..n_fam { + let a = 2 * f; + let b = 2 * f + 1; + phi_dense[(a, b)] = 0.5; + phi_dense[(b, a)] = 0.5; + } + // Construct Dense variant directly so the density-based router + // in `KinshipMatrix::new` does not re-promote to sparse. + let phi = KinshipMatrix::Dense { + matrix: phi_dense, + label: "phi_dense".into(), + }; + + // Deterministic single-trait phenotype. + let mut state: u64 = 4242; + let mut u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut y_n1 = Mat::::zeros(n, 1); + let mut x = Mat::::zeros(n, 2); + for i in 0..n { + y_n1[(i, 0)] = 0.5 + 2.0 * u01() - 1.0; + x[(i, 0)] = 1.0; + x[(i, 1)] = u01(); + } + let groups = GroupPartition::single(n); + + // Single-trait REML via the top-level dispatcher (it picks the + // dense backend automatically for our dense Φ at n=12). + let single = fit_reml( + &y_n1, + &x, + std::slice::from_ref(&phi), + &groups, + None, + 1 << 30, + ) + .expect("single-trait REML must converge on this well-conditioned pedigree"); + + // Multi-trait path with k=1. + let multi = fit_multi_kinship(&y_n1, &x, std::slice::from_ref(&phi), &groups) + .expect("multi k=1 must converge"); + + assert_eq!(multi.n_pheno, 1); + assert_eq!(multi.theta_res.len(), 1); + assert_eq!(multi.theta_g.len(), 1); + + // Theta at k=1: theta_res[0] is a 1x1 with the residual σ²; + // theta_g[0] is a 1x1 with τ. Match against single-trait tau + // within AI-REML tolerance (convergence criterion uses REML_TOL + // on both sides, so we allow a loose 1e-4 band). 
+ let single_sigma2 = single.tau.group(0); + let single_tau = single.tau.kinship(0); + let multi_sigma2 = multi.theta_res[0][(0, 0)]; + let multi_tau = multi.theta_g[0][(0, 0)]; + assert!( + (single_sigma2 - multi_sigma2).abs() < 1e-4, + "σ²_res: single={single_sigma2} multi={multi_sigma2}", + ); + assert!( + (single_tau - multi_tau).abs() < 1e-4, + "τ_phi: single={single_tau} multi={multi_tau}", + ); + + // PY at k=1 is the stacked response projected through Σ⁻¹; the + // stack is a no-op at k=1 so the numbers must match element-wise. + for i in 0..n { + let s = single.p_y[(i, 0)]; + let m = multi.p_y[(i, 0)]; + assert!((s - m).abs() < 1e-4, "PY[{i}] single={s} multi={m}"); + } + } + + /// k=1 gene stats parity: the full `(m × m)` `cov_full` must equal + /// the single-trait kinship-aware kernel + /// `K = G₀' Σ⁻¹ G₀ − (G₀' Σ⁻¹ X) · cov · (X' Σ⁻¹ G₀)` computed + /// directly from the single-trait `KinshipState`. + #[test] + fn k1_gene_stats_matches_single_trait_kernel() { + use crate::staar::kinship::fit_reml; + + let n_fam = 6; + let n = n_fam * 2; + let mut phi_dense = Mat::::zeros(n, n); + for i in 0..n { + phi_dense[(i, i)] = 1.0; + } + for f in 0..n_fam { + let a = 2 * f; + let b = 2 * f + 1; + phi_dense[(a, b)] = 0.5; + phi_dense[(b, a)] = 0.5; + } + let phi = KinshipMatrix::Dense { + matrix: phi_dense, + label: "phi_dense".into(), + }; + + let mut state: u64 = 777; + let mut u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut y_n1 = Mat::::zeros(n, 1); + let mut x = Mat::::zeros(n, 2); + let m = 4; + let mut g0 = Mat::::zeros(n, m); + for i in 0..n { + y_n1[(i, 0)] = 2.0 * u01() - 1.0; + x[(i, 0)] = 1.0; + x[(i, 1)] = u01(); + for j in 0..m { + g0[(i, j)] = if u01() < 0.2 { 1.0 } else { 0.0 }; + } + } + let groups = GroupPartition::single(n); + + let single = fit_reml( + &y_n1, + &x, + std::slice::from_ref(&phi), + &groups, + None, + 1 << 30, + ) + 
.expect("single-trait REML convergence"); + let multi = fit_multi_kinship(&y_n1, &x, std::slice::from_ref(&phi), &groups) + .expect("multi k=1 convergence"); + + // Single-trait reference: u = G₀' PY, K = G₀' Σ⁻¹ G₀ − cross. + let sinv_dense = match &single.inverse { + KinshipInverse::Dense(m) => m, + _ => panic!("expected dense single-trait inverse at n=12 dense Φ"), + }; + let u_ref = g0.transpose() * &single.p_y; + let sinv_g = sinv_dense * &g0; + let term1 = g0.transpose() * &sinv_g; + let xs_sinv_g = single.sigma_inv_x.transpose() * &g0; + let cov_corr = xs_sinv_g.transpose() * (&single.cov * &xs_sinv_g); + let mut k_ref = term1.clone(); + for i in 0..m { + for j in 0..m { + k_ref[(i, j)] -= cov_corr[(i, j)]; + } + } + + let gs = gene_stats_kinship(&g0, &multi); + assert_eq!(gs.s.nrows(), m); + assert_eq!(gs.s.ncols(), 1); + assert_eq!(gs.cov_full.nrows(), m); + assert_eq!(gs.cov_full.ncols(), m); + + for j in 0..m { + assert!( + (gs.s[(j, 0)] - u_ref[(j, 0)]).abs() < 1e-6, + "S[{j}] single={} multi={}", + u_ref[(j, 0)], + gs.s[(j, 0)], + ); + for l in 0..m { + assert!( + (gs.cov_full[(j, l)] - k_ref[(j, l)]).abs() < 1e-6, + "K[{j},{l}] single={} multi={}", + k_ref[(j, l)], + gs.cov_full[(j, l)], + ); + } + } + } + + /// k=2 omnibus smoke test: the kinship omnibus runs end-to-end and + /// produces a finite p in (0, 1]. This is not a parity test — it + /// confirms the full-cov per-test functions compose without + /// numerical blow-ups on a well-conditioned input. 
+ #[test] + fn k2_kinship_omnibus_runs_and_is_valid_p() { + let n_fam = 8; + let n = n_fam * 2; + let mut phi_dense = Mat::::zeros(n, n); + for i in 0..n { + phi_dense[(i, i)] = 1.0; + } + for f in 0..n_fam { + let a = 2 * f; + let b = 2 * f + 1; + phi_dense[(a, b)] = 0.5; + phi_dense[(b, a)] = 0.5; + } + let phi = KinshipMatrix::Dense { + matrix: phi_dense, + label: "phi_dense".into(), + }; + + let mut state: u64 = 91011; + let mut u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + let mut y = Mat::::zeros(n, 2); + let mut x = Mat::::zeros(n, 2); + let m = 5; + let mafs = vec![0.04, 0.08, 0.03, 0.06, 0.02]; + let mut g0 = Mat::::zeros(n, m); + for i in 0..n { + let e1 = 2.0 * u01() - 1.0; + let e2 = 2.0 * u01() - 1.0; + y[(i, 0)] = 0.5 + e1; + y[(i, 1)] = 0.3 + 0.4 * e1 + 0.9 * e2; + x[(i, 0)] = 1.0; + x[(i, 1)] = u01(); + for j in 0..m { + g0[(i, j)] = if u01() < mafs[j] { + if u01() < mafs[j] { 2.0 } else { 1.0 } + } else { + 0.0 + }; + } + } + let groups = GroupPartition::single(n); + let null = fit_multi_kinship(&y, &x, std::slice::from_ref(&phi), &groups) + .expect("multi k=2 should converge"); + let ann: Vec> = (0..2) + .map(|c| (0..m).map(|j| 0.5 + 0.1 * (c + j) as f64).collect()) + .collect(); + let result = run_multi_staar_kinship(&g0, &null, &ann, &mafs); + assert!(result.staar_o.is_finite(), "STAAR-O not finite: {}", result.staar_o); + assert!( + (0.0..=1.0).contains(&result.staar_o), + "STAAR-O out of range: {}", + result.staar_o, + ); + assert!(result.acat_o.is_finite()); + assert!(result.burden_1_25.is_finite()); + assert!(result.skat_1_25.is_finite()); + } + + /// Tiny-n smoke test: fit k=2 with a small pedigree kinship; assert + /// the expansion + AI-REML pipeline runs end-to-end without error + /// and returns plausibly-shaped state. 
+ #[test] + fn fit_multi_kinship_smoke_n12_k2() { + let n_fam = 6; // 12 samples total + let n = n_fam * 2; + let phi = pedigree_kinship(n_fam, "phi"); + + // Simple normal phenotype, two correlated traits. + let mut y = Mat::::zeros(n, 2); + let mut x = Mat::::zeros(n, 2); + // Deterministic xorshift so the test doesn't depend on thread-RNG. + let mut state: u64 = 123; + let mut u01 = || { + state ^= state << 13; + state ^= state >> 7; + state ^= state << 17; + (state >> 11) as f64 / (1u64 << 53) as f64 + }; + for i in 0..n { + let e1 = 2.0 * u01() - 1.0; + let e2 = 2.0 * u01() - 1.0; + y[(i, 0)] = 0.5 + e1; + y[(i, 1)] = 0.3 + 0.4 * e1 + 0.9 * e2; + x[(i, 0)] = 1.0; + x[(i, 1)] = u01(); + } + let groups = GroupPartition::single(n); + let fit = fit_multi_kinship(&y, &x, std::slice::from_ref(&phi), &groups) + .expect("fit_multi_kinship should converge on well-conditioned inputs"); + assert_eq!(fit.n_samples, n); + assert_eq!(fit.n_pheno, 2); + assert_eq!(fit.theta_res.len(), 1); + assert_eq!(fit.theta_g.len(), 1); + // Residual covariance must be PSD (diagonal entries positive). + assert!(fit.theta_res[0][(0, 0)] >= 0.0); + assert!(fit.theta_res[0][(1, 1)] >= 0.0); + assert_eq!(fit.sigma_inv_x.nrows(), n * 2); + assert_eq!(fit.cov.nrows(), 2 * 2); + assert_eq!(fit.p_y.nrows(), n * 2); + } +} diff --git a/src/staar/output.rs b/src/staar/output.rs index c3343e4..563cb91 100644 --- a/src/staar/output.rs +++ b/src/staar/output.rs @@ -23,7 +23,7 @@ use crate::column::{Col, STAAR_PHRED_CHANNELS}; use crate::error::CohortError; use crate::output::Output; use crate::store::manifest::{fsync_parent, tmp_path, write_atomic}; -use super::multi::MultiNullContinuous; +use super::multi::MultiNull; use super::{GeneResult, MaskType, TraitType}; /// Null-model summary fields embedded in `staar.meta.json`. Single-trait @@ -32,7 +32,7 @@ use super::{GeneResult, MaskType, TraitType}; /// since there is no single `σ²` scalar in that parametrisation. 
pub enum NullMeta<'a> { Single { sigma2: f64 }, - Multi(&'a MultiNullContinuous), + Multi(&'a MultiNull), } impl<'a> NullMeta<'a> { diff --git a/src/staar/pipeline.rs b/src/staar/pipeline.rs index ea2ada3..4e24fc1 100644 --- a/src/staar/pipeline.rs +++ b/src/staar/pipeline.rs @@ -13,7 +13,7 @@ use crate::runtime::Engine; use crate::staar::carrier::AnalysisVectors; use crate::staar::masks::ScangParams; use crate::staar::model::{augment_covariates, load_known_loci, load_phenotype, NullModel}; -use crate::staar::multi::MultiNullContinuous; +use crate::staar::multi::MultiNull; use crate::staar::output::{write_individual_results, write_results, NullMeta}; use crate::staar::run_manifest::{ self, ArtifactKind, CacheDecision, CacheOutcome, ConfigHashInputs, ResumeDecision, @@ -54,6 +54,11 @@ pub struct StaarConfig { pub scang_params: ScangParams, pub kinship: Vec, pub kinship_groups: Option, + /// Per-sample time values column for the longitudinal random-slope + /// LMM. `Some(col)` selects + /// [`crate::staar::kinship::fit_reml_random_slope`] instead of the + /// plain `fit_reml`; `None` leaves the fit on the standard path. + pub random_slope: Option, pub known_loci: Option, pub null_model_path: Option, pub run_mode: RunMode, @@ -371,12 +376,12 @@ impl<'a> StaarPipeline<'a> { store: &GenoStoreResult, pheno: &PhenoStageOut, ) -> Result<(), CohortError> { - // Multi-trait joint STAAR is unrelated continuous only; the - // binary reject fires inside `load_phenotype` via the trait-type - // detection, so by the time we reach this method `pheno.y` is an - // `(n, k)` continuous matrix with `k >= 2`. Fit the joint null - // directly from the multi module. - let null = self.stage(Stage::FitNullModel, |p| p.stage_fit_multi_null(pheno))?; + // `load_phenotype` enforces a uniform family across all k traits; + // `stage_fit_multi_null` dispatches to gaussian vs binomial and + // further to the kinship-aware null (GMMAT `fit_null_glmmkin_multi` + // port) when `--kinship` is set. 
+ let null = + self.stage(Stage::FitNullModel, |p| p.stage_fit_multi_null(pheno, store))?; let scoring = self.stage(Stage::RunScoring, |p| { p.stage_run_multi_scoring(store, &null, pheno) @@ -387,7 +392,14 @@ impl<'a> StaarPipeline<'a> { let n_rare = variants.len() as i64; self.stage(Stage::WriteResults, |p| { - p.stage_write_multi_results(&scoring, &variants, &null, pheno.n, n_rare) + p.stage_write_multi_results( + &scoring, + &variants, + &null, + pheno.trait_type, + pheno.n, + n_rare, + ) })?; self.manifest.outputs.results_dir = Some(self.config.results_dir()); self.manifest.write(&self.config.output_dir)?; @@ -742,11 +754,33 @@ impl<'a> StaarPipeline<'a> { }; let budget = self.engine.resources().kinship_budget_bytes; - let state = match kind { - NullModelKind::KinshipReml => { + let state = match (kind, self.config.random_slope.as_deref()) { + (NullModelKind::KinshipReml, Some(rs_col)) => { + // Random-slope longitudinal LMM. `load_random_slope` emits + // time values in pheno-compacted order (same alignment + // `load_phenotype` uses for `y` / `x`). + let time_var = staar::kinship::load_random_slope( + self.engine.df(), + &self.config.phenotype, + rs_col, + &store.geno, + &pheno.pheno_mask, + &self.config.column_map, + self.out, + )?; + staar::kinship::fit_reml_random_slope( + &pheno.y, + &pheno.x, + &kinships, + &groups, + &time_var, + budget, + )? + } + (NullModelKind::KinshipReml, None) => { staar::kinship::fit_reml(&pheno.y, &pheno.x, &kinships, &groups, None, budget)? } - NullModelKind::KinshipPql => staar::kinship::fit_pql_glmm( + (NullModelKind::KinshipPql, _) => staar::kinship::fit_pql_glmm( &pheno.y, &pheno.x, &kinships, &groups, self.out, budget, )?, _ => unreachable!("fit_kinship_null_model called with non-kinship kind"), @@ -774,19 +808,68 @@ impl<'a> StaarPipeline<'a> { }) } - /// Fit the joint multi-trait null model. 
Unrelated continuous only; - /// the binary and kinship combinations are rejected at config build - /// time, and the binary-first-trait reject fires inside - /// `load_phenotype` before this stage ever runs. + /// Fit the joint multi-trait null model. Family dispatch (gaussian + /// vs binomial) reads `pheno.trait_type`, which `load_phenotype` + /// enforces to be uniform across all traits in a multi-trait run. + /// When `--kinship` is set, the continuous path routes to + /// `MultiNull::fit_with_kinship` (port of GMMAT's + /// `fit_null_glmmkin_multi`). Binary + kinship in multi-trait is + /// not yet supported — the binary IRLS lift does not compose with + /// the Kronecker-expanded kinship list. fn stage_fit_multi_null( &mut self, pheno: &PhenoStageOut, - ) -> Result { - self.out.status("Fitting joint multi-trait null model..."); - let null = MultiNullContinuous::fit(&pheno.y, &pheno.x); + store: &GenoStoreResult, + ) -> Result { + let has_kinship = !self.config.kinship.is_empty(); + let null = match (pheno.trait_type, has_kinship) { + (TraitType::Continuous, false) => { + self.out.status("Fitting joint multi-trait null model (gaussian)..."); + MultiNull::fit(&pheno.y, &pheno.x) + } + (TraitType::Binary, false) => { + self.out.status("Fitting joint multi-trait null model (binomial)..."); + MultiNull::fit_binary(&pheno.y, &pheno.x, 25) + } + (TraitType::Continuous, true) => { + self.out.status("Fitting joint multi-trait null model (gaussian + kinship)..."); + let kinships = staar::kinship::load_kinship( + &self.config.kinship, + &pheno.compact_samples, + self.out, + )?; + let groups = match self.config.kinship_groups.as_deref() { + Some(col) => staar::kinship::load_groups( + self.engine.df(), + &self.config.phenotype, + col, + &store.geno, + &pheno.pheno_mask, + &self.config.column_map, + self.out, + )?, + None => staar::kinship::GroupPartition::single(pheno.n), + }; + MultiNull::fit_with_kinship(&pheno.y, &pheno.x, &kinships, &groups)? 
+ } + (TraitType::Binary, true) => { + return Err(CohortError::Input( + "Multi-trait binary + kinship is not supported yet. Run each binary trait \ + separately on the single-trait PQL-GLMM path, or drop --kinship for joint \ + binary multi-trait." + .into(), + )); + } + }; self.out.status(&format!( - " k = {} traits, n = {} samples, Σ_res fitted", - null.n_pheno, null.n_samples, + " k = {} traits, n = {} samples{}", + null.n_pheno, + null.n_samples, + if null.kinship.is_some() { + ", kinship-aware null fitted" + } else { + ", Σ_res fitted" + }, )); Ok(null) } @@ -794,7 +877,7 @@ impl<'a> StaarPipeline<'a> { fn stage_run_multi_scoring( &mut self, store: &GenoStoreResult, - null: &MultiNullContinuous, + null: &MultiNull, pheno: &PhenoStageOut, ) -> Result { let request = MultiScoringRequest { @@ -821,7 +904,8 @@ impl<'a> StaarPipeline<'a> { &mut self, scoring: &ScoringOutput, _variants: &[AnnotatedVariant], - null: &MultiNullContinuous, + null: &MultiNull, + trait_type: TraitType, n: usize, n_rare: i64, ) -> Result<(), CohortError> { @@ -843,7 +927,7 @@ impl<'a> StaarPipeline<'a> { self.config.maf_cutoff, &results_dir, NullMeta::Multi(null), - TraitType::Continuous, + trait_type, n, n_rare, self.config.known_loci.is_some(), @@ -1127,6 +1211,7 @@ mod tests { }, kinship: Vec::new(), kinship_groups: None, + random_slope: None, known_loci: None, null_model_path: None, run_mode: RunMode::Analyze, diff --git a/src/staar/scoring.rs b/src/staar/scoring.rs index 9985ff9..bbe1043 100644 --- a/src/staar/scoring.rs +++ b/src/staar/scoring.rs @@ -20,7 +20,7 @@ use crate::output::Output; use crate::staar::carrier::sparse_score::{self, carriers_to_dense_compact}; use crate::staar::carrier::AnalysisVectors; use crate::staar::masks::{self, MaskGroup}; -use crate::staar::multi::{self, MultiNullContinuous}; +use crate::staar::multi::{self, MultiNull}; use crate::staar::score; use crate::staar::{self, GeneResult, MaskCategory, MaskType, ScoringMode}; use 
crate::store::cache::score_cache::{self, ChromScoreCache, GeneKBlock};
@@ -929,7 +929,7 @@ pub fn run_multi_score_tests(
     cohort: &CohortHandle<'_>,
     manifest: &CohortManifest,
     request: &MultiScoringRequest<'_>,
-    null: &MultiNullContinuous,
+    null: &MultiNull,
     pheno_mask: &[bool],
     out: &dyn Output,
 ) -> Result {
@@ -1009,7 +1009,7 @@ fn score_chrom_genes_multi(
     view: &ChromosomeView<'_>,
     vcf_to_pheno: &[Option<usize>],
     n_pheno: usize,
-    null: &MultiNullContinuous,
+    null: &MultiNull,
     mask_predicates: &[(MaskType, MaskPredicate)],
     maf_cutoff: f64,
     chrom: Chromosome,
@@ -1129,7 +1129,7 @@ fn score_chrom_windows_multi(
     view: &ChromosomeView<'_>,
     vcf_to_pheno: &[Option<usize>],
     n_pheno: usize,
-    null: &MultiNullContinuous,
+    null: &MultiNull,
     request: &MultiScoringRequest<'_>,
     chrom: Chromosome,
     plan: &mut MaskPlan,
@@ -1210,7 +1210,7 @@ fn score_one_window_multi(
     view: &ChromosomeView<'_>,
     vcf_to_pheno: &[Option<usize>],
     n_pheno: usize,
-    null: &MultiNullContinuous,
+    null: &MultiNull,
     n_vcf: usize,
 ) -> Option {
     let m = group.variant_indices.len();