diff --git a/src/column.rs b/src/column.rs
index 0741bb5..1499701 100644
--- a/src/column.rs
+++ b/src/column.rs
@@ -15,6 +15,8 @@ pub enum Col {
CaddPhred,
Revel,
+ MetaSvmPred,
+ GeneHancer,
Vid,
VariantVcf,
@@ -52,6 +54,8 @@ impl Col {
Col::Consequence => "consequence",
Col::CaddPhred => "cadd_phred",
Col::Revel => "revel",
+ Col::MetaSvmPred => "metasvm_pred",
+ Col::GeneHancer => "genehancer",
Col::IsCagePromoter => "is_cage_promoter",
Col::IsCageEnhancer => "is_cage_enhancer",
Col::IsCcrePromoter => "is_ccre_promoter",
@@ -152,7 +156,7 @@ pub const STAAR_PHRED_SOURCES: [(&str, &str); 11] = [
),
];
-pub const STORE_METADATA_COLS: [Col; 13] = [
+pub const STORE_METADATA_COLS: [Col; 15] = [
Col::Position,
Col::EndPosition,
Col::RefAllele,
@@ -162,12 +166,20 @@ pub const STORE_METADATA_COLS: [Col; 13] = [
Col::RegionType,
Col::Consequence,
Col::Revel,
+ Col::MetaSvmPred,
+ Col::GeneHancer,
Col::IsCagePromoter,
Col::IsCageEnhancer,
Col::IsCcrePromoter,
Col::IsCcreEnhancer,
];
+/// Structural annotation columns beyond the 11 weight channels. Required
+/// for mask predicates that match STAARpipeline (disruptive_missense,
+/// plof_ds, ptv_ds via MetaSVM; GeneHancer pass-through for downstream
+/// tooling). Locked at cohort preflight.
+pub const STAAR_STRUCTURAL_COLS: [Col; 2] = [Col::MetaSvmPred, Col::GeneHancer];
+
pub fn store_columns() -> Vec
{
let mut cols = STORE_METADATA_COLS.to_vec();
cols.extend_from_slice(&STAAR_PHRED_CHANNELS);
@@ -213,6 +225,14 @@ pub static ANNOTATION_EXTRACTS: &[AnnotationExtract] = &[
output: Col::Revel,
sql: "CAST(COALESCE(a.dbnsfp.revel, 0.0) AS DOUBLE)",
},
+ AnnotationExtract {
+ output: Col::MetaSvmPred,
+ sql: "COALESCE(a.dbnsfp.metasvm_pred, '')",
+ },
+ AnnotationExtract {
+ output: Col::GeneHancer,
+ sql: "COALESCE(a.genehancer.id, '')",
+ },
AnnotationExtract {
output: Col::IsCagePromoter,
sql: "a.cage.cage_promoter IS NOT NULL",
@@ -473,6 +493,8 @@ pub static VARIANT_STORE_CONTRACT: SchemaContract = SchemaContract {
(Col::Consequence, ColType::Utf8),
(Col::CaddPhred, ColType::Float64),
(Col::Revel, ColType::Float64),
+ (Col::MetaSvmPred, ColType::Utf8),
+ (Col::GeneHancer, ColType::Utf8),
(Col::IsCagePromoter, ColType::Boolean),
(Col::IsCageEnhancer, ColType::Boolean),
(Col::IsCcrePromoter, ColType::Boolean),
@@ -550,7 +572,7 @@ mod tests {
#[test]
fn store_columns_complete() {
let cols = store_columns();
- assert_eq!(cols.len(), 24); // 13 metadata + 11 phred channels
+ assert_eq!(cols.len(), 26); // 15 metadata + 11 phred channels
}
#[test]
diff --git a/src/staar/masks.rs b/src/staar/masks.rs
index c1a117e..46af91b 100644
--- a/src/staar/masks.rs
+++ b/src/staar/masks.rs
@@ -297,7 +297,8 @@ fn v(
ccre_enh: bool,
) -> AnnotatedVariant {
use crate::types::{
- AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags,
+ AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType,
+ RegulatoryFlags,
};
AnnotatedVariant {
chromosome: Chromosome::Autosome(22),
@@ -306,11 +307,13 @@ fn v(
alt_allele: "T".into(),
maf: 0.001,
gene_name: "BRCA2".into(),
+ genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::from_str_lossy(region_type),
consequence: Consequence::from_str_lossy(consequence),
cadd_phred: cadd,
revel,
+ metasvm_pred: MetaSvmPred::Unknown,
weights: AnnotationWeights([0.0; 11]),
regulatory: RegulatoryFlags {
cage_promoter: cage_prom,
@@ -681,7 +684,8 @@ mod tests {
fn var_at(pos: u32) -> AnnotatedVariant {
use crate::types::{
- AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags,
+ AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType,
+ RegulatoryFlags,
};
AnnotatedVariant {
chromosome: Chromosome::Autosome(1),
@@ -690,11 +694,13 @@ mod tests {
alt_allele: "T".into(),
maf: 0.005,
gene_name: "".into(),
+ genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::Unknown,
consequence: Consequence::Unknown,
cadd_phred: 10.0,
revel: 0.0,
+ metasvm_pred: MetaSvmPred::Unknown,
weights: AnnotationWeights([0.5; 11]),
regulatory: RegulatoryFlags::default(),
},
diff --git a/src/staar/meta.rs b/src/staar/meta.rs
index 4878485..c187c47 100644
--- a/src/staar/meta.rs
+++ b/src/staar/meta.rs
@@ -46,7 +46,7 @@ use crate::store::cohort::{ChromosomeView, CohortId};
use crate::store::list::parquet_column_names;
use crate::types::{
AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation,
- MetaVariant, RegionType, RegulatoryFlags,
+ MetaSvmPred, MetaVariant, RegionType, RegulatoryFlags,
};
const META_SUMSTATS_COLUMNS: &[ColumnRequirement] = &[
@@ -399,6 +399,8 @@ pub fn merge_chromosome(
first_value({csq}) FILTER (WHERE {csq} != '') AS {csq}, \
first_value(cadd_phred_raw) AS cadd_phred_raw, \
first_value({revel}) AS {revel}, \
+ first_value({msp}) FILTER (WHERE {msp} != '') AS {msp}, \
+ first_value({gh}) FILTER (WHERE {gh} != '') AS {gh}, \
bool_or({cage_p}) AS {cage_p}, \
bool_or({cage_e}) AS {cage_e}, \
bool_or({ccre_p}) AS {ccre_p}, \
@@ -415,6 +417,7 @@ pub fn merge_chromosome(
maf = Col::Maf,
gene = Col::GeneName, region = Col::RegionType, csq = Col::Consequence,
revel = Col::Revel,
+ msp = Col::MetaSvmPred, gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter, ccre_e = Col::IsCcreEnhancer,
))?;
@@ -422,7 +425,7 @@ pub fn merge_chromosome(
let batches = engine.collect(&format!(
"SELECT {pos}, {ref_a}, {alt_a}, u_meta, \
mac_total, n_total, {gene}, {region}, {csq}, \
- cadd_phred_raw, {revel}, \
+ cadd_phred_raw, {revel}, {msp}, {gh}, \
{cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \
{weight_select}, \
study_segs \
@@ -434,6 +437,8 @@ pub fn merge_chromosome(
region = Col::RegionType,
csq = Col::Consequence,
revel = Col::Revel,
+ msp = Col::MetaSvmPred,
+ gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter,
cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter,
@@ -492,19 +497,21 @@ pub fn merge_chromosome(
let csq_arr = str_col(8)?;
let cadd_raw_arr = f64_col(9)?;
let revel_arr = f64_col(10)?;
- let cp_arr = bool_col(11)?;
- let ce_arr = bool_col(12)?;
- let crp_arr = bool_col(13)?;
- let cre_arr = bool_col(14)?;
+ let msp_arr = str_col(11)?;
+ let gh_arr = str_col(12)?;
+ let cp_arr = bool_col(13)?;
+ let ce_arr = bool_col(14)?;
+ let crp_arr = bool_col(15)?;
+ let cre_arr = bool_col(16)?;
// Sumstats parquet stores transformed [0,1] weights under the
// FAVOR-native channel names; pass them through so the
// PHRED -> probability transform is not applied a second time.
let mut w_arrs: Vec<&Float64Array> = Vec::with_capacity(11);
for i in 0..11 {
- w_arrs.push(f64_col(15 + i)?);
+ w_arrs.push(f64_col(17 + i)?);
}
- let segs_arr = str_col(26)?;
+ let segs_arr = str_col(28)?;
for i in 0..n {
let mut weights = [0.0f64; 11];
@@ -532,11 +539,13 @@ pub fn merge_chromosome(
alt_allele: str_or(alt_arr, i, "").into(),
maf,
gene_name: str_or(gene_arr, i, "").into(),
+ genehancer: str_or(gh_arr, i, "").into(),
annotation: FunctionalAnnotation {
region_type: RegionType::from_str_lossy(str_or(rt_arr, i, "")),
consequence: Consequence::from_str_lossy(str_or(csq_arr, i, "")),
cadd_phred,
revel: f64_or(revel_arr, i, 0.0),
+ metasvm_pred: MetaSvmPred::from_str_lossy(str_or(msp_arr, i, "")),
regulatory: RegulatoryFlags {
cage_promoter: bool_or(cp_arr, i, false),
cage_enhancer: bool_or(ce_arr, i, false),
@@ -899,6 +908,8 @@ fn variant_schema() -> Schema {
Field::new(Col::Consequence.as_str(), DataType::Utf8, false),
Field::new("cadd_phred_raw", DataType::Float64, false),
Field::new(Col::Revel.as_str(), DataType::Float64, false),
+ Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false),
+ Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false),
Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false),
Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false),
Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false),
@@ -964,6 +975,8 @@ struct SumstatsWriter {
consequence: StringBuilder,
cadd: Float64Builder,
revel: Float64Builder,
+ metasvm_pred: StringBuilder,
+ genehancer: StringBuilder,
cage_prom: BooleanBuilder,
cage_enh: BooleanBuilder,
ccre_prom: BooleanBuilder,
@@ -996,6 +1009,8 @@ impl SumstatsWriter {
consequence: StringBuilder::with_capacity(total_variants, total_variants * 8),
cadd: Float64Builder::with_capacity(total_variants),
revel: Float64Builder::with_capacity(total_variants),
+ metasvm_pred: StringBuilder::with_capacity(total_variants, total_variants),
+ genehancer: StringBuilder::with_capacity(total_variants, total_variants * 8),
cage_prom: BooleanBuilder::with_capacity(total_variants),
cage_enh: BooleanBuilder::with_capacity(total_variants),
ccre_prom: BooleanBuilder::with_capacity(total_variants),
@@ -1043,6 +1058,9 @@ impl SumstatsWriter {
.append_value(v.annotation.consequence.as_str());
self.cadd.append_value(v.annotation.cadd_phred);
self.revel.append_value(v.annotation.revel);
+ self.metasvm_pred
+ .append_value(v.annotation.metasvm_pred.as_str());
+ self.genehancer.append_value(&v.genehancer);
self.cage_prom
.append_value(v.annotation.regulatory.cage_promoter);
self.cage_enh
@@ -1111,6 +1129,8 @@ impl SumstatsWriter {
Arc::new(self.consequence.finish()),
Arc::new(self.cadd.finish()),
Arc::new(self.revel.finish()),
+ Arc::new(self.metasvm_pred.finish()),
+ Arc::new(self.genehancer.finish()),
Arc::new(self.cage_prom.finish()),
Arc::new(self.cage_enh.finish()),
Arc::new(self.ccre_prom.finish()),
@@ -1458,11 +1478,13 @@ mod tests {
alt_allele: alt_b.into(),
maf,
gene_name: "GENE".into(),
+ genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::Exonic,
consequence: Consequence::NonsynonymousSNV,
cadd_phred: 20.0,
revel: 0.5,
+ metasvm_pred: MetaSvmPred::Unknown,
regulatory: RegulatoryFlags::default(),
weights: AnnotationWeights([1.0; 11]),
},
diff --git a/src/staar/pipeline.rs b/src/staar/pipeline.rs
index 3117a6a..fbedcc0 100644
--- a/src/staar/pipeline.rs
+++ b/src/staar/pipeline.rs
@@ -454,6 +454,7 @@ impl<'a> StaarPipeline<'a> {
let weight_cols: Vec = STAAR_PHRED_CHANNELS.to_vec();
ann_vs.supports(&weight_cols)?;
ann_vs.require_staar_weight_catalog()?;
+ ann_vs.require_structural_annotation_catalog()?;
// Raw annotation column contract.
let contract = ColumnContract {
diff --git a/src/store/cohort/builder.rs b/src/store/cohort/builder.rs
index 1227dd6..1099f66 100644
--- a/src/store/cohort/builder.rs
+++ b/src/store/cohort/builder.rs
@@ -402,6 +402,8 @@ fn write_variants_parquet(
region_type: Box,
consequence: Box,
revel: f64,
+ metasvm_pred: Box,
+ genehancer: Box,
cage_prom: bool,
cage_enh: bool,
ccre_prom: bool,
@@ -419,6 +421,8 @@ fn write_variants_parquet(
let rt_arr = str_col_by_name(batch, Col::RegionType.as_str())?;
let csq_arr = str_col_by_name(batch, Col::Consequence.as_str())?;
let revel_arr = col_by_name::(batch, Col::Revel.as_str())?;
+ let msp_arr = str_col_by_name(batch, Col::MetaSvmPred.as_str())?;
+ let gh_arr = str_col_by_name(batch, Col::GeneHancer.as_str())?;
let cps = col_by_name::(batch, Col::IsCagePromoter.as_str())?;
let ces = col_by_name::(batch, Col::IsCageEnhancer.as_str())?;
let crps = col_by_name::(batch, Col::IsCcrePromoter.as_str())?;
@@ -449,6 +453,8 @@ fn write_variants_parquet(
region_type: rt_arr.value(i).into(),
consequence: csq_arr.value(i).into(),
revel: revel_arr.value(i),
+ metasvm_pred: msp_arr.value(i).into(),
+ genehancer: gh_arr.value(i).into(),
cage_prom: cps.value(i),
cage_enh: ces.value(i),
ccre_prom: crps.value(i),
@@ -470,6 +476,8 @@ fn write_variants_parquet(
let mut rt_b = StringBuilder::with_capacity(n, n * 16);
let mut csq_b = StringBuilder::with_capacity(n, n * 16);
let mut revel_b = Float64Builder::with_capacity(n);
+ let mut msp_b = StringBuilder::with_capacity(n, n);
+ let mut gh_b = StringBuilder::with_capacity(n, n * 8);
let mut cp_b = BooleanBuilder::with_capacity(n);
let mut ce_b = BooleanBuilder::with_capacity(n);
let mut crp_b = BooleanBuilder::with_capacity(n);
@@ -498,6 +506,8 @@ fn write_variants_parquet(
rt_b.append_value(&*m.region_type);
csq_b.append_value(&*m.consequence);
revel_b.append_value(m.revel);
+ msp_b.append_value(&*m.metasvm_pred);
+ gh_b.append_value(&*m.genehancer);
cp_b.append_value(m.cage_prom);
ce_b.append_value(m.cage_enh);
crp_b.append_value(m.ccre_prom);
@@ -511,6 +521,8 @@ fn write_variants_parquet(
rt_b.append_value("");
csq_b.append_value("");
revel_b.append_value(0.0);
+ msp_b.append_value("");
+ gh_b.append_value("");
cp_b.append_value(false);
ce_b.append_value(false);
crp_b.append_value(false);
@@ -535,6 +547,8 @@ fn write_variants_parquet(
Field::new(Col::RegionType.as_str(), DataType::Utf8, false),
Field::new(Col::Consequence.as_str(), DataType::Utf8, false),
Field::new(Col::Revel.as_str(), DataType::Float64, false),
+ Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false),
+ Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false),
Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false),
Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false),
Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false),
@@ -556,6 +570,8 @@ fn write_variants_parquet(
Arc::new(rt_b.finish()),
Arc::new(csq_b.finish()),
Arc::new(revel_b.finish()),
+ Arc::new(msp_b.finish()),
+ Arc::new(gh_b.finish()),
Arc::new(cp_b.finish()),
Arc::new(ce_b.finish()),
Arc::new(crp_b.finish()),
diff --git a/src/store/cohort/variants.rs b/src/store/cohort/variants.rs
index cfb2009..d8417b6 100644
--- a/src/store/cohort/variants.rs
+++ b/src/store/cohort/variants.rs
@@ -6,8 +6,8 @@ use std::path::Path;
use crate::error::CohortError;
use crate::store::backend::{Backend, BoxedBatchReader};
use crate::types::{
- AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, RegionType,
- RegulatoryFlags,
+ AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, MetaSvmPred,
+ RegionType, RegulatoryFlags,
};
#[derive(Clone, Copy, Debug)]
@@ -42,6 +42,8 @@ pub struct VariantIndexEntry {
pub consequence: Consequence,
pub cadd_phred: f64,
pub revel: f64,
+ pub metasvm_pred: MetaSvmPred,
+ pub genehancer: Box,
pub regulatory: RegulatoryFlags,
pub weights: AnnotationWeights,
}
@@ -55,11 +57,13 @@ impl VariantIndexEntry {
alt_allele: self.alt_allele.clone(),
maf: self.maf,
gene_name: "".into(), // gene is a derived property, not stored per-variant
+ genehancer: self.genehancer.clone(),
annotation: FunctionalAnnotation {
region_type: self.region_type,
consequence: self.consequence,
cadd_phred: self.cadd_phred,
revel: self.revel,
+ metasvm_pred: self.metasvm_pred,
regulatory: self.regulatory,
weights: self.weights,
},
@@ -188,6 +192,8 @@ fn load_variant_entries(reader: BoxedBatchReader) -> Result(&batch, Col::RegionType.as_str())?;
let csq_arr = col_by_name::(&batch, Col::Consequence.as_str())?;
let revel_arr = col_by_name::(&batch, Col::Revel.as_str())?;
+ let msp_arr = col_by_name::(&batch, Col::MetaSvmPred.as_str())?;
+ let gh_arr = col_by_name::(&batch, Col::GeneHancer.as_str())?;
let cp_arr = col_by_name::(&batch, Col::IsCagePromoter.as_str())?;
let ce_arr = col_by_name::(&batch, Col::IsCageEnhancer.as_str())?;
let crp_arr = col_by_name::(&batch, Col::IsCcrePromoter.as_str())?;
@@ -223,6 +229,8 @@ fn load_variant_entries(reader: BoxedBatchReader) -> Result Result<(), CohortError> {
+ let have = self.columns();
+ let missing: Vec<&str> = crate::column::STAAR_STRUCTURAL_COLS
+ .iter()
+ .map(|c| c.as_str())
+ .filter(|name| !have.iter().any(|h| h == name))
+ .collect();
+ if missing.is_empty() {
+ return Ok(());
+ }
+ Err(CohortError::DataMissing(format!(
+ "Cohort store at {} is missing structural annotation columns:\n {}\n\
+ Required for MetaSVM-driven coding masks and GeneHancer pass-through.\n\
+ Rebuild the cohort: `favor ingest --annotations --cohort-id `.",
+ self.root.display(),
+ missing.join(", "),
+ )))
+ }
}
pub struct VariantSetWriter {
diff --git a/src/test_fixtures.rs b/src/test_fixtures.rs
index 0475a14..e6ad327 100644
--- a/src/test_fixtures.rs
+++ b/src/test_fixtures.rs
@@ -1,8 +1,8 @@
use std::path::Path;
use crate::types::{
- AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, RegionType,
- RegulatoryFlags,
+ AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation,
+ MetaSvmPred, RegionType, RegulatoryFlags,
};
// Raw FAVOR PHRED values for the OR11H1 stopgain on chr22. These go into
@@ -33,11 +33,13 @@ pub fn base_variant() -> AnnotatedVariant {
alt_allele: "T".into(),
maf: 0.0007,
gene_name: "OR11H1".into(),
+ genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::Exonic,
consequence: Consequence::Stopgain,
cadd_phred: 23.7,
revel: 0.0,
+ metasvm_pred: MetaSvmPred::Unknown,
weights: AnnotationWeights(GROUND_TRUTH_PHREDS),
regulatory: RegulatoryFlags::default(),
},
@@ -336,7 +338,8 @@ fn annotation_rows_sql() -> String {
named_struct('region_type', '{rt}', 'genes', make_array('{gene}'), \
'consequence', {csq}) AS gencode, \
named_struct('cadd', named_struct('raw', CAST(0.0 AS FLOAT), 'phred', CAST({cadd} AS FLOAT))) AS main, \
- named_struct('revel', {revel}) AS dbnsfp, \
+ named_struct('revel', {revel}, 'metasvm_pred', CAST(NULL AS VARCHAR)) AS dbnsfp, \
+ named_struct('id', CAST(NULL AS VARCHAR)) AS genehancer, \
CAST({linsight} AS FLOAT) AS linsight, \
CAST({fathmm} AS FLOAT) AS fathmm_xf, \
{cage} AS cage, \
@@ -437,6 +440,8 @@ pub fn staar_rare_sql() -> String {
COALESCE(a.gencode.region_type, '') AS {region},
COALESCE(a.gencode.consequence, '') AS {csq},
COALESCE(a.dbnsfp.revel, 0) AS {revel},
+ COALESCE(a.dbnsfp.metasvm_pred, '') AS {msp},
+ COALESCE(a.genehancer.id, '') AS {gh},
a.cage.cage_promoter IS NOT NULL AS {cage_p},
a.cage.cage_enhancer IS NOT NULL AS {cage_e},
COALESCE(CAST(a.ccre.annotations AS VARCHAR) LIKE '%PLS%', false) AS {ccre_p},
@@ -451,6 +456,7 @@ pub fn staar_rare_sql() -> String {
ref_a = Col::RefAllele, alt_a = Col::AltAllele,
gene = Col::GeneName, region = Col::RegionType, csq = Col::Consequence,
revel = Col::Revel,
+ msp = Col::MetaSvmPred, gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter, ccre_e = Col::IsCcreEnhancer,
)
@@ -509,13 +515,13 @@ mod tests {
fn read_extracted_variants(engine: &DfEngine) -> Vec {
use crate::column::{Col, STAAR_PHRED_CHANNELS};
- // _rare table columns, in the order the SELECT below fixes:
+ // _rare SELECT layout:
// 0: chromosome
- // 1-7: position, ref, alt, maf, gene, region_type, consequence
- // 8: revel
- // 9-12: cage_prom, cage_enh, ccre_prom, ccre_enh
- // 13-23: 11 FAVOR-native PHRED channels (cadd_phred first)
- // The PHRED -> [0,1] transform matches VariantIndex::load().
+ // 1-8: position, ref, alt, maf, gene, region_type, consequence, revel
+ // 9-10: metasvm_pred, genehancer
+ // 11-14: cage_prom, cage_enh, ccre_prom, ccre_enh
+ // 15-25: 11 FAVOR-native PHRED channels (cadd_phred first)
+ // PHRED -> [0,1] transform matches VariantIndex::load().
let phred_cols = STAAR_PHRED_CHANNELS
.iter()
.map(|c| c.as_str())
@@ -524,7 +530,7 @@ mod tests {
let batches = engine
.collect(&format!(
"SELECT {chr}, {pos}, {ref_a}, {alt_a}, {maf}, \
- {gene}, {region}, {csq}, {revel}, \
+ {gene}, {region}, {csq}, {revel}, {msp}, {gh}, \
{cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \
{phred_cols} \
FROM _rare ORDER BY {pos}",
@@ -537,6 +543,8 @@ mod tests {
region = Col::RegionType,
csq = Col::Consequence,
revel = Col::Revel,
+ msp = Col::MetaSvmPred,
+ gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter,
cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter,
@@ -549,14 +557,14 @@ mod tests {
for row in 0..batch.num_rows() {
let mut weights = [0.0f64; 11];
for (i, w) in weights.iter_mut().enumerate() {
- let phred = num_col_val(batch.column(13 + i).as_ref(), row);
+ let phred = num_col_val(batch.column(15 + i).as_ref(), row);
*w = if phred > 0.0 && phred.is_finite() {
1.0 - 10f64.powf(-phred / 10.0)
} else {
0.0
};
}
- let cadd_phred = num_col_val(batch.column(13).as_ref(), row);
+ let cadd_phred = num_col_val(batch.column(15).as_ref(), row);
variants.push(AnnotatedVariant {
chromosome: str_col_val(batch.column(0).as_ref(), row)
@@ -567,6 +575,7 @@ mod tests {
alt_allele: str_col_val(batch.column(3).as_ref(), row).into(),
maf: num_col_val(batch.column(4).as_ref(), row),
gene_name: str_col_val(batch.column(5).as_ref(), row).into(),
+ genehancer: str_col_val(batch.column(10).as_ref(), row).into(),
annotation: FunctionalAnnotation {
region_type: RegionType::from_str_lossy(&str_col_val(
batch.column(6).as_ref(),
@@ -578,11 +587,15 @@ mod tests {
)),
cadd_phred,
revel: num_col_val(batch.column(8).as_ref(), row),
+ metasvm_pred: MetaSvmPred::from_str_lossy(&str_col_val(
+ batch.column(9).as_ref(),
+ row,
+ )),
regulatory: RegulatoryFlags {
- cage_promoter: bool_col_val(batch.column(9).as_ref(), row),
- cage_enhancer: bool_col_val(batch.column(10).as_ref(), row),
- ccre_promoter: bool_col_val(batch.column(11).as_ref(), row),
- ccre_enhancer: bool_col_val(batch.column(12).as_ref(), row),
+ cage_promoter: bool_col_val(batch.column(11).as_ref(), row),
+ cage_enhancer: bool_col_val(batch.column(12).as_ref(), row),
+ ccre_promoter: bool_col_val(batch.column(13).as_ref(), row),
+ ccre_enhancer: bool_col_val(batch.column(14).as_ref(), row),
},
weights: AnnotationWeights(weights),
},
diff --git a/src/types.rs b/src/types.rs
index e0177c7..84e89c9 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -421,6 +421,35 @@ pub struct RegulatoryFlags {
pub ccre_enhancer: bool,
}
+/// dbNSFP MetaSVM ensemble missense prediction. "D" (Deleterious) drives
+/// STAARpipeline's disruptive_missense / plof_ds / ptv_ds predicates; "T"
+/// (Tolerated) and missing (".") collapse to Unknown.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[repr(u8)]
+pub enum MetaSvmPred {
+ #[default]
+ Unknown,
+ Deleterious,
+ Tolerated,
+}
+
+impl MetaSvmPred {
+ pub fn as_str(self) -> &'static str {
+ match self {
+ Self::Deleterious => "D",
+ Self::Tolerated => "T",
+ Self::Unknown => "",
+ }
+ }
+ pub fn from_str_lossy(s: &str) -> Self {
+ match s {
+ "D" => Self::Deleterious,
+ "T" => Self::Tolerated,
+ _ => Self::Unknown,
+ }
+ }
+}
+
/// Everything STAAR needs to classify (masks) and weight (score tests) a variant.
///
/// Mask predicates access `consequence` / `region_type`.
@@ -431,6 +460,7 @@ pub struct FunctionalAnnotation {
pub consequence: Consequence,
pub cadd_phred: f64,
pub revel: f64,
+ pub metasvm_pred: MetaSvmPred,
pub regulatory: RegulatoryFlags,
pub weights: AnnotationWeights,
}
@@ -447,6 +477,9 @@ pub struct AnnotatedVariant {
pub alt_allele: Box,
pub maf: f64,
pub gene_name: Box,
+ /// Raw GeneHancer identifier string from FAVOR (`a.genehancer.id`).
+ /// Pass-through to sumstats; no mask predicate reads it today.
+ pub genehancer: Box,
pub annotation: FunctionalAnnotation,
}