Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions src/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub enum Col {

CaddPhred,
Revel,
MetaSvmPred,
GeneHancer,

Vid,
VariantVcf,
Expand Down Expand Up @@ -52,6 +54,8 @@ impl Col {
Col::Consequence => "consequence",
Col::CaddPhred => "cadd_phred",
Col::Revel => "revel",
Col::MetaSvmPred => "metasvm_pred",
Col::GeneHancer => "genehancer",
Col::IsCagePromoter => "is_cage_promoter",
Col::IsCageEnhancer => "is_cage_enhancer",
Col::IsCcrePromoter => "is_ccre_promoter",
Expand Down Expand Up @@ -152,7 +156,7 @@ pub const STAAR_PHRED_SOURCES: [(&str, &str); 11] = [
),
];

pub const STORE_METADATA_COLS: [Col; 13] = [
pub const STORE_METADATA_COLS: [Col; 15] = [
Col::Position,
Col::EndPosition,
Col::RefAllele,
Expand All @@ -162,12 +166,20 @@ pub const STORE_METADATA_COLS: [Col; 13] = [
Col::RegionType,
Col::Consequence,
Col::Revel,
Col::MetaSvmPred,
Col::GeneHancer,
Col::IsCagePromoter,
Col::IsCageEnhancer,
Col::IsCcrePromoter,
Col::IsCcreEnhancer,
];

/// Structural annotation columns beyond the 11 weight channels. Required
/// for mask predicates that match STAARpipeline (disruptive_missense,
/// plof_ds, ptv_ds via MetaSVM; GeneHancer pass-through for downstream
/// tooling). Locked at cohort preflight.
pub const STAAR_STRUCTURAL_COLS: [Col; 2] = [Col::MetaSvmPred, Col::GeneHancer];

pub fn store_columns() -> Vec<Col> {
let mut cols = STORE_METADATA_COLS.to_vec();
cols.extend_from_slice(&STAAR_PHRED_CHANNELS);
Expand Down Expand Up @@ -213,6 +225,14 @@ pub static ANNOTATION_EXTRACTS: &[AnnotationExtract] = &[
output: Col::Revel,
sql: "CAST(COALESCE(a.dbnsfp.revel, 0.0) AS DOUBLE)",
},
AnnotationExtract {
output: Col::MetaSvmPred,
sql: "COALESCE(a.dbnsfp.metasvm_pred, '')",
},
AnnotationExtract {
output: Col::GeneHancer,
sql: "COALESCE(a.genehancer.id, '')",
},
AnnotationExtract {
output: Col::IsCagePromoter,
sql: "a.cage.cage_promoter IS NOT NULL",
Expand Down Expand Up @@ -473,6 +493,8 @@ pub static VARIANT_STORE_CONTRACT: SchemaContract = SchemaContract {
(Col::Consequence, ColType::Utf8),
(Col::CaddPhred, ColType::Float64),
(Col::Revel, ColType::Float64),
(Col::MetaSvmPred, ColType::Utf8),
(Col::GeneHancer, ColType::Utf8),
(Col::IsCagePromoter, ColType::Boolean),
(Col::IsCageEnhancer, ColType::Boolean),
(Col::IsCcrePromoter, ColType::Boolean),
Expand Down Expand Up @@ -550,7 +572,7 @@ mod tests {
#[test]
fn store_columns_complete() {
let cols = store_columns();
assert_eq!(cols.len(), 24); // 13 metadata + 11 phred channels
assert_eq!(cols.len(), 26); // 15 metadata + 11 phred channels
}

#[test]
Expand Down
10 changes: 8 additions & 2 deletions src/staar/masks.rs
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,8 @@ fn v(
ccre_enh: bool,
) -> AnnotatedVariant {
use crate::types::{
AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags,
AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType,
RegulatoryFlags,
};
AnnotatedVariant {
chromosome: Chromosome::Autosome(22),
Expand All @@ -306,11 +307,13 @@ fn v(
alt_allele: "T".into(),
maf: 0.001,
gene_name: "BRCA2".into(),
genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::from_str_lossy(region_type),
consequence: Consequence::from_str_lossy(consequence),
cadd_phred: cadd,
revel,
metasvm_pred: MetaSvmPred::Unknown,
weights: AnnotationWeights([0.0; 11]),
regulatory: RegulatoryFlags {
cage_promoter: cage_prom,
Expand Down Expand Up @@ -681,7 +684,8 @@ mod tests {

fn var_at(pos: u32) -> AnnotatedVariant {
use crate::types::{
AnnotationWeights, Chromosome, FunctionalAnnotation, RegionType, RegulatoryFlags,
AnnotationWeights, Chromosome, FunctionalAnnotation, MetaSvmPred, RegionType,
RegulatoryFlags,
};
AnnotatedVariant {
chromosome: Chromosome::Autosome(1),
Expand All @@ -690,11 +694,13 @@ mod tests {
alt_allele: "T".into(),
maf: 0.005,
gene_name: "".into(),
genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::Unknown,
consequence: Consequence::Unknown,
cadd_phred: 10.0,
revel: 0.0,
metasvm_pred: MetaSvmPred::Unknown,
weights: AnnotationWeights([0.5; 11]),
regulatory: RegulatoryFlags::default(),
},
Expand Down
38 changes: 30 additions & 8 deletions src/staar/meta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use crate::store::cohort::{ChromosomeView, CohortId};
use crate::store::list::parquet_column_names;
use crate::types::{
AnnotatedVariant, AnnotationWeights, Chromosome, Consequence, FunctionalAnnotation,
MetaVariant, RegionType, RegulatoryFlags,
MetaSvmPred, MetaVariant, RegionType, RegulatoryFlags,
};

const META_SUMSTATS_COLUMNS: &[ColumnRequirement] = &[
Expand Down Expand Up @@ -399,6 +399,8 @@ pub fn merge_chromosome(
first_value({csq}) FILTER (WHERE {csq} != '') AS {csq}, \
first_value(cadd_phred_raw) AS cadd_phred_raw, \
first_value({revel}) AS {revel}, \
first_value({msp}) FILTER (WHERE {msp} != '') AS {msp}, \
first_value({gh}) FILTER (WHERE {gh} != '') AS {gh}, \
bool_or({cage_p}) AS {cage_p}, \
bool_or({cage_e}) AS {cage_e}, \
bool_or({ccre_p}) AS {ccre_p}, \
Expand All @@ -415,14 +417,15 @@ pub fn merge_chromosome(
maf = Col::Maf,
gene = Col::GeneName, region = Col::RegionType, csq = Col::Consequence,
revel = Col::Revel,
msp = Col::MetaSvmPred, gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter, cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter, ccre_e = Col::IsCcreEnhancer,
))?;

let batches = engine.collect(&format!(
"SELECT {pos}, {ref_a}, {alt_a}, u_meta, \
mac_total, n_total, {gene}, {region}, {csq}, \
cadd_phred_raw, {revel}, \
cadd_phred_raw, {revel}, {msp}, {gh}, \
{cage_p}, {cage_e}, {ccre_p}, {ccre_e}, \
{weight_select}, \
study_segs \
Expand All @@ -434,6 +437,8 @@ pub fn merge_chromosome(
region = Col::RegionType,
csq = Col::Consequence,
revel = Col::Revel,
msp = Col::MetaSvmPred,
gh = Col::GeneHancer,
cage_p = Col::IsCagePromoter,
cage_e = Col::IsCageEnhancer,
ccre_p = Col::IsCcrePromoter,
Expand Down Expand Up @@ -492,19 +497,21 @@ pub fn merge_chromosome(
let csq_arr = str_col(8)?;
let cadd_raw_arr = f64_col(9)?;
let revel_arr = f64_col(10)?;
let cp_arr = bool_col(11)?;
let ce_arr = bool_col(12)?;
let crp_arr = bool_col(13)?;
let cre_arr = bool_col(14)?;
let msp_arr = str_col(11)?;
let gh_arr = str_col(12)?;
let cp_arr = bool_col(13)?;
let ce_arr = bool_col(14)?;
let crp_arr = bool_col(15)?;
let cre_arr = bool_col(16)?;

// Sumstats parquet stores transformed [0,1] weights under the
// FAVOR-native channel names; pass them through so the
// PHRED -> probability transform is not applied a second time.
let mut w_arrs: Vec<&Float64Array> = Vec::with_capacity(11);
for i in 0..11 {
w_arrs.push(f64_col(15 + i)?);
w_arrs.push(f64_col(17 + i)?);
}
let segs_arr = str_col(26)?;
let segs_arr = str_col(28)?;

for i in 0..n {
let mut weights = [0.0f64; 11];
Expand Down Expand Up @@ -532,11 +539,13 @@ pub fn merge_chromosome(
alt_allele: str_or(alt_arr, i, "").into(),
maf,
gene_name: str_or(gene_arr, i, "").into(),
genehancer: str_or(gh_arr, i, "").into(),
annotation: FunctionalAnnotation {
region_type: RegionType::from_str_lossy(str_or(rt_arr, i, "")),
consequence: Consequence::from_str_lossy(str_or(csq_arr, i, "")),
cadd_phred,
revel: f64_or(revel_arr, i, 0.0),
metasvm_pred: MetaSvmPred::from_str_lossy(str_or(msp_arr, i, "")),
regulatory: RegulatoryFlags {
cage_promoter: bool_or(cp_arr, i, false),
cage_enhancer: bool_or(ce_arr, i, false),
Expand Down Expand Up @@ -899,6 +908,8 @@ fn variant_schema() -> Schema {
Field::new(Col::Consequence.as_str(), DataType::Utf8, false),
Field::new("cadd_phred_raw", DataType::Float64, false),
Field::new(Col::Revel.as_str(), DataType::Float64, false),
Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false),
Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false),
Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false),
Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false),
Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false),
Expand Down Expand Up @@ -964,6 +975,8 @@ struct SumstatsWriter {
consequence: StringBuilder,
cadd: Float64Builder,
revel: Float64Builder,
metasvm_pred: StringBuilder,
genehancer: StringBuilder,
cage_prom: BooleanBuilder,
cage_enh: BooleanBuilder,
ccre_prom: BooleanBuilder,
Expand Down Expand Up @@ -996,6 +1009,8 @@ impl SumstatsWriter {
consequence: StringBuilder::with_capacity(total_variants, total_variants * 8),
cadd: Float64Builder::with_capacity(total_variants),
revel: Float64Builder::with_capacity(total_variants),
metasvm_pred: StringBuilder::with_capacity(total_variants, total_variants),
genehancer: StringBuilder::with_capacity(total_variants, total_variants * 8),
cage_prom: BooleanBuilder::with_capacity(total_variants),
cage_enh: BooleanBuilder::with_capacity(total_variants),
ccre_prom: BooleanBuilder::with_capacity(total_variants),
Expand Down Expand Up @@ -1043,6 +1058,9 @@ impl SumstatsWriter {
.append_value(v.annotation.consequence.as_str());
self.cadd.append_value(v.annotation.cadd_phred);
self.revel.append_value(v.annotation.revel);
self.metasvm_pred
.append_value(v.annotation.metasvm_pred.as_str());
self.genehancer.append_value(&v.genehancer);
self.cage_prom
.append_value(v.annotation.regulatory.cage_promoter);
self.cage_enh
Expand Down Expand Up @@ -1111,6 +1129,8 @@ impl SumstatsWriter {
Arc::new(self.consequence.finish()),
Arc::new(self.cadd.finish()),
Arc::new(self.revel.finish()),
Arc::new(self.metasvm_pred.finish()),
Arc::new(self.genehancer.finish()),
Arc::new(self.cage_prom.finish()),
Arc::new(self.cage_enh.finish()),
Arc::new(self.ccre_prom.finish()),
Expand Down Expand Up @@ -1458,11 +1478,13 @@ mod tests {
alt_allele: alt_b.into(),
maf,
gene_name: "GENE".into(),
genehancer: "".into(),
annotation: FunctionalAnnotation {
region_type: RegionType::Exonic,
consequence: Consequence::NonsynonymousSNV,
cadd_phred: 20.0,
revel: 0.5,
metasvm_pred: MetaSvmPred::Unknown,
regulatory: RegulatoryFlags::default(),
weights: AnnotationWeights([1.0; 11]),
},
Expand Down
1 change: 1 addition & 0 deletions src/staar/pipeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,7 @@ impl<'a> StaarPipeline<'a> {
let weight_cols: Vec<crate::column::Col> = STAAR_PHRED_CHANNELS.to_vec();
ann_vs.supports(&weight_cols)?;
ann_vs.require_staar_weight_catalog()?;
ann_vs.require_structural_annotation_catalog()?;

// Raw annotation column contract.
let contract = ColumnContract {
Expand Down
16 changes: 16 additions & 0 deletions src/store/cohort/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,8 @@ fn write_variants_parquet(
region_type: Box<str>,
consequence: Box<str>,
revel: f64,
metasvm_pred: Box<str>,
genehancer: Box<str>,
cage_prom: bool,
cage_enh: bool,
ccre_prom: bool,
Expand All @@ -419,6 +421,8 @@ fn write_variants_parquet(
let rt_arr = str_col_by_name(batch, Col::RegionType.as_str())?;
let csq_arr = str_col_by_name(batch, Col::Consequence.as_str())?;
let revel_arr = col_by_name::<Float64Array>(batch, Col::Revel.as_str())?;
let msp_arr = str_col_by_name(batch, Col::MetaSvmPred.as_str())?;
let gh_arr = str_col_by_name(batch, Col::GeneHancer.as_str())?;
let cps = col_by_name::<BooleanArray>(batch, Col::IsCagePromoter.as_str())?;
let ces = col_by_name::<BooleanArray>(batch, Col::IsCageEnhancer.as_str())?;
let crps = col_by_name::<BooleanArray>(batch, Col::IsCcrePromoter.as_str())?;
Expand Down Expand Up @@ -449,6 +453,8 @@ fn write_variants_parquet(
region_type: rt_arr.value(i).into(),
consequence: csq_arr.value(i).into(),
revel: revel_arr.value(i),
metasvm_pred: msp_arr.value(i).into(),
genehancer: gh_arr.value(i).into(),
cage_prom: cps.value(i),
cage_enh: ces.value(i),
ccre_prom: crps.value(i),
Expand All @@ -470,6 +476,8 @@ fn write_variants_parquet(
let mut rt_b = StringBuilder::with_capacity(n, n * 16);
let mut csq_b = StringBuilder::with_capacity(n, n * 16);
let mut revel_b = Float64Builder::with_capacity(n);
let mut msp_b = StringBuilder::with_capacity(n, n);
let mut gh_b = StringBuilder::with_capacity(n, n * 8);
let mut cp_b = BooleanBuilder::with_capacity(n);
let mut ce_b = BooleanBuilder::with_capacity(n);
let mut crp_b = BooleanBuilder::with_capacity(n);
Expand Down Expand Up @@ -498,6 +506,8 @@ fn write_variants_parquet(
rt_b.append_value(&*m.region_type);
csq_b.append_value(&*m.consequence);
revel_b.append_value(m.revel);
msp_b.append_value(&*m.metasvm_pred);
gh_b.append_value(&*m.genehancer);
cp_b.append_value(m.cage_prom);
ce_b.append_value(m.cage_enh);
crp_b.append_value(m.ccre_prom);
Expand All @@ -511,6 +521,8 @@ fn write_variants_parquet(
rt_b.append_value("");
csq_b.append_value("");
revel_b.append_value(0.0);
msp_b.append_value("");
gh_b.append_value("");
cp_b.append_value(false);
ce_b.append_value(false);
crp_b.append_value(false);
Expand All @@ -535,6 +547,8 @@ fn write_variants_parquet(
Field::new(Col::RegionType.as_str(), DataType::Utf8, false),
Field::new(Col::Consequence.as_str(), DataType::Utf8, false),
Field::new(Col::Revel.as_str(), DataType::Float64, false),
Field::new(Col::MetaSvmPred.as_str(), DataType::Utf8, false),
Field::new(Col::GeneHancer.as_str(), DataType::Utf8, false),
Field::new(Col::IsCagePromoter.as_str(), DataType::Boolean, false),
Field::new(Col::IsCageEnhancer.as_str(), DataType::Boolean, false),
Field::new(Col::IsCcrePromoter.as_str(), DataType::Boolean, false),
Expand All @@ -556,6 +570,8 @@ fn write_variants_parquet(
Arc::new(rt_b.finish()),
Arc::new(csq_b.finish()),
Arc::new(revel_b.finish()),
Arc::new(msp_b.finish()),
Arc::new(gh_b.finish()),
Arc::new(cp_b.finish()),
Arc::new(ce_b.finish()),
Arc::new(crp_b.finish()),
Expand Down
Loading
Loading