From 83670a375b6be0b8711a436b7461d336e54d90d2 Mon Sep 17 00:00:00 2001 From: Vineet Date: Thu, 16 Apr 2026 17:24:10 -0400 Subject: [PATCH] staar: _cond suffix on conditional output files (#102) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The conditional math already runs today — augment_covariates at pipeline.rs:581-588 expands X with known_loci before the null is fit, and by the OLS projection identity that is mathematically equivalent to R's method_cond="optimal" two-stage procedure (regress y~X, then residuals on [known_loci, X]). Downstream score tests see the Schur-complement- projected K = G'(I - H_aug)G automatically. What was missing: the output files didn't say so. This PR suffixes mask-level outputs with _cond and sets meta.conditional=true when --known-loci is active, matching STAARpipelineSummary's file layout (coding_plof.parquet vs coding_plof_cond.parquet) so the summary tooling can tell conditional from unconditional runs. SPA + --known-loci now emits a warning: SPA reads raw G not the augmented projection, so gene-mask p-values in that combo are SPA- adjusted but not conditional. The cond-SPA variant (Gene_Centric_*_cond_spa in R) is a follow-up. 296/296, clippy clean. --- src/staar/output.rs | 10 ++++++++-- src/staar/pipeline.rs | 16 +++++++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/staar/output.rs b/src/staar/output.rs index 11d73e9..a333d6e 100644 --- a/src/staar/output.rs +++ b/src/staar/output.rs @@ -103,9 +103,11 @@ pub struct IndividualRow { pub fn write_individual_results( rows: &[IndividualRow], output_dir: &Path, + is_conditional: bool, out: &dyn Output, ) -> Result<(), CohortError> { - let out_path = output_dir.join("individual.parquet"); + let stem = if is_conditional { "individual_cond" } else { "individual" }; + let out_path = output_dir.join(format!("{stem}.parquet")); let n = rows.len(); // R orders by POS ascending before returning (Individual_Analysis.R:450). @@ -381,8 +383,10 @@ pub fn write_results( trait_type: TraitType, n: usize, n_rare: i64, + is_conditional: bool, out: &dyn Output, ) -> Result<(), CohortError> { + let cond_suffix = if is_conditional { "_cond" } else { "" }; out.status("Writing results..."); let mut significant_genes: Vec = Vec::new(); @@ -397,7 +401,8 @@ pub fn write_results( if results.is_empty() { continue; } - let out_path = output_dir.join(format!("{}.parquet", mask_type.file_stem())); + let out_path = + output_dir.join(format!("{}{cond_suffix}.parquet", mask_type.file_stem())); let nan_pvals = results.iter().filter(|r| r.staar.staar_o.is_nan()).count(); if nan_pvals > 0 { @@ -453,6 +458,7 @@ pub fn write_results( "cohort_staar_version": 1, "traits": trait_names, "trait_type": format!("{:?}", trait_type), "n_samples": n, "n_rare_variants": n_rare, "maf_cutoff": maf_cutoff, + "conditional": is_conditional, "significant_genes": significant_genes, }); // Merge null-model summary fields (sigma2 for single-trait, or the diff --git a/src/staar/pipeline.rs b/src/staar/pipeline.rs index fbedcc0..6d33ed8 100644 --- a/src/staar/pipeline.rs +++ b/src/staar/pipeline.rs @@ -825,6 +825,7 @@ impl<'a> StaarPipeline<'a> { TraitType::Continuous, n, n_rare, + self.config.known_loci.is_some(), self.out, ) } @@ -986,8 +987,15 @@ impl<'a> StaarPipeline<'a> { )) })?; + let is_conditional = self.config.known_loci.is_some(); + if self.config.individual && !scoring.individual.is_empty() { - write_individual_results(&scoring.individual, &results_dir, self.out)?; + write_individual_results( + &scoring.individual, + &results_dir, + is_conditional, + self.out, + )?; } write_results( @@ -1001,6 +1009,7 @@ impl<'a> StaarPipeline<'a> { trait_type, n, n_rare, + is_conditional, self.out, ) } @@ -1019,6 +1028,11 @@ impl<'a> StaarPipeline<'a> { "--spa with --kinship: SPA is applied at the score-test layer; the kinship-aware path provides exact variance via the GLMM projection.", ); } + if self.config.known_loci.is_some() && mode == ScoringMode::Spa { + self.out.warn( + "--spa with --known-loci: the known-loci adjustment is absorbed into the fitted null, but the SPA saddlepoint reads raw G; gene-mask p-values from this run are SPA-adjusted but not conditional. A separate --cond-spa path is tracked in STAARpipeline as Gene_Centric_*_cond_spa.", + ); + } } fn cohort(&self) -> CohortHandle<'_> {