diff --git a/src/column.rs b/src/column.rs index 0741bb5..1499701 100644 --- a/src/column.rs +++ b/src/column.rs @@ -15,6 +15,8 @@ pub enum Col { CaddPhred, Revel, + MetaSvmPred, + GeneHancer, Vid, VariantVcf, @@ -52,6 +54,8 @@ impl Col { Col::Consequence => "consequence", Col::CaddPhred => "cadd_phred", Col::Revel => "revel", + Col::MetaSvmPred => "metasvm_pred", + Col::GeneHancer => "genehancer", Col::IsCagePromoter => "is_cage_promoter", Col::IsCageEnhancer => "is_cage_enhancer", Col::IsCcrePromoter => "is_ccre_promoter", @@ -152,7 +156,7 @@ pub const STAAR_PHRED_SOURCES: [(&str, &str); 11] = [ ), ]; -pub const STORE_METADATA_COLS: [Col; 13] = [ +pub const STORE_METADATA_COLS: [Col; 15] = [ Col::Position, Col::EndPosition, Col::RefAllele, @@ -162,12 +166,20 @@ pub const STORE_METADATA_COLS: [Col; 13] = [ Col::RegionType, Col::Consequence, Col::Revel, + Col::MetaSvmPred, + Col::GeneHancer, Col::IsCagePromoter, Col::IsCageEnhancer, Col::IsCcrePromoter, Col::IsCcreEnhancer, ]; +/// Structural annotation columns beyond the 11 weight channels. Required +/// for mask predicates that match STAARpipeline (disruptive_missense, +/// plof_ds, ptv_ds via MetaSVM; GeneHancer pass-through for downstream +/// tooling). Locked at cohort preflight. +pub const STAAR_STRUCTURAL_COLS: [Col; 2] = [Col::MetaSvmPred, Col::GeneHancer]; + pub fn store_columns() -> Vec { let mut cols = STORE_METADATA_COLS.to_vec(); cols.extend_from_slice(&STAAR_PHRED_CHANNELS); @@ -213,6 +225,14 @@ pub static ANNOTATION_EXTRACTS: &[AnnotationExtract] = &[ output: Col::Revel, sql: "CAST(COALESCE(a.dbnsfp.revel, 0.0) AS DOUBLE)", }, + AnnotationExtract { + output: Col::MetaSvmPred, + sql: "COALESCE(a.dbnsfp.metasvm_pred, '')", + }, + AnnotationExtract { + output: Col::GeneHancer, + sql: "COALESCE(a.genehancer.id, '')", + }, AnnotationExtract { output: Col::IsCagePromoter, sql: "a.cage.cage_promoter IS NOT NULL", @@ -473,6 +493,8 @@ pub static VARIANT_STORE_CONTRACT: SchemaContract = SchemaContract { (Col::Consequence, ColType::Utf8), (Col::CaddPhred, ColType::Float64), (Col::Revel, ColType::Float64), + (Col::MetaSvmPred, ColType::Utf8), + (Col::GeneHancer, ColType::Utf8), (Col::IsCagePromoter, ColType::Boolean), (Col::IsCageEnhancer, ColType::Boolean), (Col::IsCcrePromoter, ColType::Boolean), @@ -550,7 +572,7 @@ mod tests { #[test] fn store_columns_complete() { let cols = store_columns(); - assert_eq!(cols.len(), 24); // 13 metadata + 11 phred channels + assert_eq!(cols.len(), 26); // 15 metadata + 11 phred channels } #[test] diff --git a/src/staar/masks.rs b/src/staar/masks.rs index c1a117e..46af91b 100644 --- a/src/staar/masks.rs +++ b/src/staar/masks.rs @@ -297,7 +297,8 @@ fn v( ccre_enh: bool, ) -> AnnotatedVariant { use crate::types::{ - AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags, + AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType, + RegulatoryFlags, }; AnnotatedVariant { chromosome: Chromosome::Autosome(22), @@ -306,11 +307,13 @@ fn v( alt_allele: "T".into(), maf: 0.001, gene_name: "BRCA2".into(), + genehancer: "".into(), annotation: FunctionalAnnotation { region_type: RegionType::from_str_lossy(region_type), consequence: Consequence::from_str_lossy(consequence), cadd_phred: cadd, revel, + metasvm_pred: MetaSvmPred::Unknown, weights: AnnotationWeights([0.0; 11]), regulatory: RegulatoryFlags { cage_promoter: cage_prom, @@ -681,7 +684,8 @@ mod tests { fn var_at(pos: u32) -> AnnotatedVariant { use crate::types::{ - AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags, + AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType, + RegulatoryFlags, }; AnnotatedVariant { chromosome: Chromosome::Autosome(1), @@ -690,11 +694,13 @@ mod tests { alt_allele: "T".into(), maf: 0.005, gene_name: "".into(), + genehancer: "".into(), annotation: FunctionalAnnotation { region_type: RegionType::Unknown, consequence: Consequence::Unknown, cadd_phred: 10.0, revel: 0.0, + metasvm_pred: MetaSvmPred::Unknown, weights: AnnotationWeights([0.5; 11]), regulatory: RegulatoryFlags::default(), }, diff --git a/src/staar/meta.rs b/src/staar/meta.rs index 4878485..c187c47 100644 --- a/src/staar/meta.rs +++ b/src/staar/meta.rs @@ -46,7 +46,7 @@ use crate::store::cohort::{ChromosomeView, CohortId}; use crate::store::list::parquet_column_names; use crate::types::{ AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, - MetaVariant, RegionType, RegulatoryFlags, + MetaSvmPred, MetaVariant, RegionType, RegulatoryFlags, }; const META_SUMSTATS_COLUMNS: &[ColumnRequirement] = &[ @@ -399,6 +399,8 @@ pub fn merge_chromosome( first_value({csq}) FILTER (WHERE {csq} != '') AS {csq}, \ first_value(cadd_phred_raw) AS cadd_phred_raw, \ first_value({revel}) AS {revel}, \ + first_value({msp}) FILTER (WHERE {msp} != '') AS {msp}, \ + first_value({gh}) FILTER (WHERE {gh} != '') AS {gh}, \ bool_or({cage_p}) AS {cage_p}, \ bool_or({cage_e}) AS {cage_e}, \ bool_or({ccre_p}) AS {ccre_p}, \ @@ -415,6 +417,7 @@ pub fn merge_chromosome( maf = Col::Maf, gene = Col::GeneName, region = Col::RegionType, csq = Col::Consequence, revel = Col::Revel, + msp = Col::MetaSvmPred, gh = Col::GeneHancer, cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer, ccre_p = Col::IsCcrePromoter, ccre_e = Col::IsCcreEnhancer, ))?; @@ -422,7 +425,7 @@ pub fn merge_chromosome( let batches = engine.collect(&format!( "SELECT {pos}, {ref_a}, {alt_a}, u_meta, \ mac_total, n_total, {gene}, {region}, {csq}, \ - cadd_phred_raw, {revel}, \ + cadd_phred_raw, {revel}, {msp}, {gh}, \ {cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \ {weight_select}, \ study_segs \ @@ -434,6 +437,8 @@ pub fn merge_chromosome( region = Col::RegionType, csq = Col::Consequence, revel = Col::Revel, + msp = Col::MetaSvmPred, + gh = Col::GeneHancer, cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer, ccre_p = Col::IsCcrePromoter, @@ -492,19 +497,21 @@ pub fn merge_chromosome( let csq_arr = str_col(8)?; let cadd_raw_arr = f64_col(9)?; let revel_arr = f64_col(10)?; - let cp_arr = bool_col(11)?; - let ce_arr = bool_col(12)?; - let crp_arr = bool_col(13)?; - let cre_arr = bool_col(14)?; + let msp_arr = str_col(11)?; + let gh_arr = str_col(12)?; + let cp_arr = bool_col(13)?; + let ce_arr = bool_col(14)?; + let crp_arr = bool_col(15)?; + let cre_arr = bool_col(16)?; // Sumstats parquet stores transformed [0,1] weights under the // FAVOR-native channel names; pass them through so the // PHRED -> probability transform is not applied a second time. let mut w_arrs: Vec<&Float64Array> = Vec::with_capacity(11); for i in 0..11 { - w_arrs.push(f64_col(15 + i)?); + w_arrs.push(f64_col(17 + i)?); } - let segs_arr = str_col(26)?; + let segs_arr = str_col(28)?; for i in 0..n { let mut weights = [0.0f64; 11]; @@ -532,11 +539,13 @@ pub fn merge_chromosome( alt_allele: str_or(alt_arr, i, "").into(), maf, gene_name: str_or(gene_arr, i, "").into(), + genehancer: str_or(gh_arr, i, "").into(), annotation: FunctionalAnnotation { region_type: RegionType::from_str_lossy(str_or(rt_arr, i, "")), consequence: Consequence::from_str_lossy(str_or(csq_arr, i, "")), cadd_phred, revel: f64_or(revel_arr, i, 0.0), + metasvm_pred: MetaSvmPred::from_str_lossy(str_or(msp_arr, i, "")), regulatory: RegulatoryFlags { cage_promoter: bool_or(cp_arr, i, false), cage_enhancer: bool_or(ce_arr, i, false), @@ -899,6 +908,8 @@ fn variant_schema() -> Schema { Field::new(Col::Consequence.as_str(), DataType::Utf8, false), Field::new("cadd_phred_raw", DataType::Float64, false), Field::new(Col::Revel.as_str(), DataType::Float64, false), + Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false), + Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false), Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false), Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false), Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false), @@ -964,6 +975,8 @@ struct SumstatsWriter { consequence: StringBuilder, cadd: Float64Builder, revel: Float64Builder, + metasvm_pred: StringBuilder, + genehancer: StringBuilder, cage_prom: BooleanBuilder, cage_enh: BooleanBuilder, ccre_prom: BooleanBuilder, @@ -996,6 +1009,8 @@ impl SumstatsWriter { consequence: StringBuilder::with_capacity(total_variants, total_variants * 8), cadd: Float64Builder::with_capacity(total_variants), revel: Float64Builder::with_capacity(total_variants), + metasvm_pred: StringBuilder::with_capacity(total_variants, total_variants), + genehancer: StringBuilder::with_capacity(total_variants, total_variants * 8), cage_prom: BooleanBuilder::with_capacity(total_variants), cage_enh: BooleanBuilder::with_capacity(total_variants), ccre_prom: BooleanBuilder::with_capacity(total_variants), @@ -1043,6 +1058,9 @@ impl SumstatsWriter { .append_value(v.annotation.consequence.as_str()); self.cadd.append_value(v.annotation.cadd_phred); self.revel.append_value(v.annotation.revel); + self.metasvm_pred + .append_value(v.annotation.metasvm_pred.as_str()); + self.genehancer.append_value(&v.genehancer); self.cage_prom .append_value(v.annotation.regulatory.cage_promoter); self.cage_enh @@ -1111,6 +1129,8 @@ impl SumstatsWriter { Arc::new(self.consequence.finish()), Arc::new(self.cadd.finish()), Arc::new(self.revel.finish()), + Arc::new(self.metasvm_pred.finish()), + Arc::new(self.genehancer.finish()), Arc::new(self.cage_prom.finish()), Arc::new(self.cage_enh.finish()), Arc::new(self.ccre_prom.finish()), @@ -1458,11 +1478,13 @@ mod tests { alt_allele: alt_b.into(), maf, gene_name: "GENE".into(), + genehancer: "".into(), annotation: FunctionalAnnotation { region_type: RegionType::Exonic, consequence: Consequence::NonsynonymousSNV, cadd_phred: 20.0, revel: 0.5, + metasvm_pred: MetaSvmPred::Unknown, regulatory: RegulatoryFlags::default(), weights: AnnotationWeights([1.0; 11]), }, diff --git a/src/staar/pipeline.rs b/src/staar/pipeline.rs index 3117a6a..fbedcc0 100644 --- a/src/staar/pipeline.rs +++ b/src/staar/pipeline.rs @@ -454,6 +454,7 @@ impl<'a> StaarPipeline<'a> { let weight_cols: Vec = STAAR_PHRED_CHANNELS.to_vec(); ann_vs.supports(&weight_cols)?; ann_vs.require_staar_weight_catalog()?; + ann_vs.require_structural_annotation_catalog()?; // Raw annotation column contract. let contract = ColumnContract { diff --git a/src/store/cohort/builder.rs b/src/store/cohort/builder.rs index 1227dd6..1099f66 100644 --- a/src/store/cohort/builder.rs +++ b/src/store/cohort/builder.rs @@ -402,6 +402,8 @@ fn write_variants_parquet( region_type: Box, consequence: Box, revel: f64, + metasvm_pred: Box, + genehancer: Box, cage_prom: bool, cage_enh: bool, ccre_prom: bool, @@ -419,6 +421,8 @@ fn write_variants_parquet( let rt_arr = str_col_by_name(batch, Col::RegionType.as_str())?; let csq_arr = str_col_by_name(batch, Col::Consequence.as_str())?; let revel_arr = col_by_name::(batch, Col::Revel.as_str())?; + let msp_arr = str_col_by_name(batch, Col::MetaSvmPred.as_str())?; + let gh_arr = str_col_by_name(batch, Col::GeneHancer.as_str())?; let cps = col_by_name::(batch, Col::IsCagePromoter.as_str())?; let ces = col_by_name::(batch, Col::IsCageEnhancer.as_str())?; let crps = col_by_name::(batch, Col::IsCcrePromoter.as_str())?; @@ -449,6 +453,8 @@ fn write_variants_parquet( region_type: rt_arr.value(i).into(), consequence: csq_arr.value(i).into(), revel: revel_arr.value(i), + metasvm_pred: msp_arr.value(i).into(), + genehancer: gh_arr.value(i).into(), cage_prom: cps.value(i), cage_enh: ces.value(i), ccre_prom: crps.value(i), @@ -470,6 +476,8 @@ fn write_variants_parquet( let mut rt_b = StringBuilder::with_capacity(n, n * 16); let mut csq_b = StringBuilder::with_capacity(n, n * 16); let mut revel_b = Float64Builder::with_capacity(n); + let mut msp_b = StringBuilder::with_capacity(n, n); + let mut gh_b = StringBuilder::with_capacity(n, n * 8); let mut cp_b = BooleanBuilder::with_capacity(n); let mut ce_b = BooleanBuilder::with_capacity(n); let mut crp_b = BooleanBuilder::with_capacity(n); @@ -498,6 +506,8 @@ fn write_variants_parquet( rt_b.append_value(&*m.region_type); csq_b.append_value(&*m.consequence); revel_b.append_value(m.revel); + msp_b.append_value(&*m.metasvm_pred); + gh_b.append_value(&*m.genehancer); cp_b.append_value(m.cage_prom); ce_b.append_value(m.cage_enh); crp_b.append_value(m.ccre_prom); @@ -511,6 +521,8 @@ fn write_variants_parquet( rt_b.append_value(""); csq_b.append_value(""); revel_b.append_value(0.0); + msp_b.append_value(""); + gh_b.append_value(""); cp_b.append_value(false); ce_b.append_value(false); crp_b.append_value(false); @@ -535,6 +547,8 @@ fn write_variants_parquet( Field::new(Col::RegionType.as_str(), DataType::Utf8, false), Field::new(Col::Consequence.as_str(), DataType::Utf8, false), Field::new(Col::Revel.as_str(), DataType::Float64, false), + Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false), + Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false), Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false), Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false), Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false), @@ -556,6 +570,8 @@ fn write_variants_parquet( Arc::new(rt_b.finish()), Arc::new(csq_b.finish()), Arc::new(revel_b.finish()), + Arc::new(msp_b.finish()), + Arc::new(gh_b.finish()), Arc::new(cp_b.finish()), Arc::new(ce_b.finish()), Arc::new(crp_b.finish()), diff --git a/src/store/cohort/variants.rs b/src/store/cohort/variants.rs index cfb2009..d8417b6 100644 --- a/src/store/cohort/variants.rs +++ b/src/store/cohort/variants.rs @@ -6,8 +6,8 @@ use std::path::Path; use crate::error::CohortError; use crate::store::backend::{Backend, BoxedBatchReader}; use crate::types::{ - AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, RegionType, - RegulatoryFlags, + AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, MetaSvmPred, + RegionType, RegulatoryFlags, }; #[derive(Clone, Copy, Debug)] @@ -42,6 +42,8 @@ pub struct VariantIndexEntry { pub consequence: Consequence, pub cadd_phred: f64, pub revel: f64, + pub metasvm_pred: MetaSvmPred, + pub genehancer: Box, pub regulatory: RegulatoryFlags, pub weights: AnnotationWeights, } @@ -55,11 +57,13 @@ impl VariantIndexEntry { alt_allele: self.alt_allele.clone(), maf: self.maf, gene_name: "".into(), // gene is a derived property, not stored per-variant + genehancer: self.genehancer.clone(), annotation: FunctionalAnnotation { region_type: self.region_type, consequence: self.consequence, cadd_phred: self.cadd_phred, revel: self.revel, + metasvm_pred: self.metasvm_pred, regulatory: self.regulatory, weights: self.weights, }, @@ -188,6 +192,8 @@ fn load_variant_entries(reader: BoxedBatchReader) -> Result(&batch, Col::RegionType.as_str())?; let csq_arr = col_by_name::(&batch, Col::Consequence.as_str())?; let revel_arr = col_by_name::(&batch, Col::Revel.as_str())?; + let msp_arr = col_by_name::(&batch, Col::MetaSvmPred.as_str())?; + let gh_arr = col_by_name::(&batch, Col::GeneHancer.as_str())?; let cp_arr = col_by_name::(&batch, Col::IsCagePromoter.as_str())?; let ce_arr = col_by_name::(&batch, Col::IsCageEnhancer.as_str())?; let crp_arr = col_by_name::(&batch, Col::IsCcrePromoter.as_str())?; @@ -223,6 +229,8 @@ fn load_variant_entries(reader: BoxedBatchReader) -> Result Result<(), CohortError> { + let have = self.columns(); + let missing: Vec<&str> = crate::column::STAAR_STRUCTURAL_COLS + .iter() + .map(|c| c.as_str()) + .filter(|name| !have.iter().any(|h| h == name)) + .collect(); + if missing.is_empty() { + return Ok(()); + } + Err(CohortError::DataMissing(format!( + "Cohort store at {} is missing structural annotation columns:\n {}\n\ + Required for MetaSVM-driven coding masks and GeneHancer pass-through.\n\ + Rebuild the cohort: `favor ingest --annotations --cohort-id `.", + self.root.display(), + missing.join(", "), + ))) + } } pub struct VariantSetWriter { diff --git a/src/test_fixtures.rs b/src/test_fixtures.rs index 0475a14..e6ad327 100644 --- a/src/test_fixtures.rs +++ b/src/test_fixtures.rs @@ -1,8 +1,8 @@ use std::path::Path; use crate::types::{ - AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, RegionType, - RegulatoryFlags, + AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation, + MetaSvmPred, RegionType, RegulatoryFlags, }; // Raw FAVOR PHRED values for the OR11H1 stopgain on chr22. These go into @@ -33,11 +33,13 @@ pub fn base_variant() -> AnnotatedVariant { alt_allele: "T".into(), maf: 0.0007, gene_name: "OR11H1".into(), + genehancer: "".into(), annotation: FunctionalAnnotation { region_type: RegionType::Exonic, consequence: Consequence::Stopgain, cadd_phred: 23.7, revel: 0.0, + metasvm_pred: MetaSvmPred::Unknown, weights: AnnotationWeights(GROUND_TRUTH_PHREDS), regulatory: RegulatoryFlags::default(), }, @@ -336,7 +338,8 @@ fn annotation_rows_sql() -> String { named_struct('region_type', '{rt}', 'genes', make_array('{gene}'), \ 'consequence', {csq}) AS gencode, \ named_struct('cadd', named_struct('raw', CAST(0.0 AS FLOAT), 'phred', CAST({cadd} AS FLOAT))) AS main, \ - named_struct('revel', {revel}) AS dbnsfp, \ + named_struct('revel', {revel}, 'metasvm_pred', CAST(NULL AS VARCHAR)) AS dbnsfp, \ + named_struct('id', CAST(NULL AS VARCHAR)) AS genehancer, \ CAST({linsight} AS FLOAT) AS linsight, \ CAST({fathmm} AS FLOAT) AS fathmm_xf, \ {cage} AS cage, \ @@ -437,6 +440,8 @@ pub fn staar_rare_sql() -> String { COALESCE(a.gencode.region_type, '') AS {region}, COALESCE(a.gencode.consequence, '') AS {csq}, COALESCE(a.dbnsfp.revel, 0) AS {revel}, + COALESCE(a.dbnsfp.metasvm_pred, '') AS {msp}, + COALESCE(a.genehancer.id, '') AS {gh}, a.cage.cage_promoter IS NOT NULL AS {cage_p}, a.cage.cage_enhancer IS NOT NULL AS {cage_e}, COALESCE(CAST(a.ccre.annotations AS VARCHAR) LIKE '%PLS%', false) AS {ccre_p}, @@ -451,6 +456,7 @@ pub fn staar_rare_sql() -> String { ref_a = Col::RefAllele, alt_a = Col::AltAllele, gene = Col::GeneName, region = Col::RegionType, csq = Col::Consequence, revel = Col::Revel, + msp = Col::MetaSvmPred, gh = Col::GeneHancer, cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer, ccre_p = Col::IsCcrePromoter, ccre_e = Col::IsCcreEnhancer, ) @@ -509,13 +515,13 @@ mod tests { fn read_extracted_variants(engine: &DfEngine) -> Vec { use crate::column::{Col, STAAR_PHRED_CHANNELS}; - // _rare table columns, in the order the SELECT below fixes: + // _rare SELECT layout: // 0: chromosome - // 1-7: position, ref, alt, maf, gene, region_type, consequence - // 8: revel - // 9-12: cage_prom, cage_enh, ccre_prom, ccre_enh - // 13-23: 11 FAVOR-native PHRED channels (cadd_phred first) - // The PHRED -> [0,1] transform matches VariantIndex::load(). + // 1-8: position, ref, alt, maf, gene, region_type, consequence, revel + // 9-10: metasvm_pred, genehancer + // 11-14: cage_prom, cage_enh, ccre_prom, ccre_enh + // 15-25: 11 FAVOR-native PHRED channels (cadd_phred first) + // PHRED -> [0,1] transform matches VariantIndex::load(). let phred_cols = STAAR_PHRED_CHANNELS .iter() .map(|c| c.as_str()) @@ -524,7 +530,7 @@ mod tests { let batches = engine .collect(&format!( "SELECT {chr}, {pos}, {ref_a}, {alt_a}, {maf}, \ - {gene}, {region}, {csq}, {revel}, \ + {gene}, {region}, {csq}, {revel}, {msp}, {gh}, \ {cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \ {phred_cols} \ FROM _rare ORDER BY {pos}", @@ -537,6 +543,8 @@ mod tests { region = Col::RegionType, csq = Col::Consequence, revel = Col::Revel, + msp = Col::MetaSvmPred, + gh = Col::GeneHancer, cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer, ccre_p = Col::IsCcrePromoter, @@ -549,14 +557,14 @@ mod tests { for row in 0..batch.num_rows() { let mut weights = [0.0f64; 11]; for (i, w) in weights.iter_mut().enumerate() { - let phred = num_col_val(batch.column(13 + i).as_ref(), row); + let phred = num_col_val(batch.column(15 + i).as_ref(), row); *w = if phred > 0.0 && phred.is_finite() { 1.0 - 10f64.powf(-phred / 10.0) } else { 0.0 }; } - let cadd_phred = num_col_val(batch.column(13).as_ref(), row); + let cadd_phred = num_col_val(batch.column(15).as_ref(), row); variants.push(AnnotatedVariant { chromosome: str_col_val(batch.column(0).as_ref(), row) @@ -567,6 +575,7 @@ mod tests { alt_allele: str_col_val(batch.column(3).as_ref(), row).into(), maf: num_col_val(batch.column(4).as_ref(), row), gene_name: str_col_val(batch.column(5).as_ref(), row).into(), + genehancer: str_col_val(batch.column(10).as_ref(), row).into(), annotation: FunctionalAnnotation { region_type: RegionType::from_str_lossy(&str_col_val( batch.column(6).as_ref(), @@ -578,11 +587,15 @@ mod tests { )), cadd_phred, revel: num_col_val(batch.column(8).as_ref(), row), + metasvm_pred: MetaSvmPred::from_str_lossy(&str_col_val( + batch.column(9).as_ref(), + row, + )), regulatory: RegulatoryFlags { - cage_promoter: bool_col_val(batch.column(9).as_ref(), row), - cage_enhancer: bool_col_val(batch.column(10).as_ref(), row), - ccre_promoter: bool_col_val(batch.column(11).as_ref(), row), - ccre_enhancer: bool_col_val(batch.column(12).as_ref(), row), + cage_promoter: bool_col_val(batch.column(11).as_ref(), row), + cage_enhancer: bool_col_val(batch.column(12).as_ref(), row), + ccre_promoter: bool_col_val(batch.column(13).as_ref(), row), + ccre_enhancer: bool_col_val(batch.column(14).as_ref(), row), }, weights: AnnotationWeights(weights), }, diff --git a/src/types.rs b/src/types.rs index e0177c7..84e89c9 100644 --- a/src/types.rs +++ b/src/types.rs @@ -421,6 +421,35 @@ pub struct RegulatoryFlags { pub ccre_enhancer: bool, } +/// dbNSFP MetaSVM ensemble missense prediction. "D" (Deleterious) drives +/// STAARpipeline's disruptive_missense / plof_ds / ptv_ds predicates; "T" +/// (Tolerated) and missing (".") collapse to Unknown. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[repr(u8)] +pub enum MetaSvmPred { + #[default] + Unknown, + Deleterious, + Tolerated, +} + +impl MetaSvmPred { + pub fn as_str(self) -> &'static str { + match self { + Self::Deleterious => "D", + Self::Tolerated => "T", + Self::Unknown => "", + } + } + pub fn from_str_lossy(s: &str) -> Self { + match s { + "D" => Self::Deleterious, + "T" => Self::Tolerated, + _ => Self::Unknown, + } + } +} + /// Everything STAAR needs to classify (masks) and weight (score tests) a variant. /// /// Mask predicates access `consequence` / `region_type`. @@ -431,6 +460,7 @@ pub struct FunctionalAnnotation { pub consequence: Consequence, pub cadd_phred: f64, pub revel: f64, + pub metasvm_pred: MetaSvmPred, pub regulatory: RegulatoryFlags, pub weights: AnnotationWeights, } @@ -447,6 +477,9 @@ pub struct AnnotatedVariant { pub alt_allele: Box, pub maf: f64, pub gene_name: Box, + /// Raw GeneHancer identifier string from FAVOR (`a.genehancer.id`). + /// Pass-through to sumstats; no mask predicate reads it today. + pub genehancer: Box, pub annotation: FunctionalAnnotation, }