Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 61 additions & 7 deletions src/staar/genotype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,15 +236,27 @@ impl GenotypeWriter {
let mut ac: f64 = 0.0;
let mut an: f64 = 0.0;

if !samples_str.is_empty() && samples_str != "." {
let bytes = samples_str.as_bytes();
// noodles exposes `Samples` as everything after INFO, which in VCF
// includes the FORMAT column (e.g. "GT" or "GT:AD:DP"). Skip past
// the first tab so the memchr loop sees only per-sample fields.
let body = if samples_str.is_empty() || samples_str == "." {
""
} else {
match memchr::memchr(b'\t', samples_str.as_bytes()) {
Some(tab) => &samples_str[tab + 1..],
None => "",
}
};

if !body.is_empty() {
let bytes = body.as_bytes();
let n = self.n_samples;
let mut sample_idx: usize = 0;
let mut start: usize = 0;

// memchr uses AVX2/SSE4.2 to find tabs at 32 bytes/cycle.
// For 200K-sample lines (~7MB), this replaces the dominant cost
// of byte-by-byte split('\t').
// memchr uses AVX2/SSE4.2 to find tabs at 32 bytes/cycle. For
// 200K-sample lines (~7MB), this replaces the dominant cost of
// byte-by-byte split('\t').
for tab_pos in memchr::memchr_iter(b'\t', bytes) {
if sample_idx >= n { break; }
let gt_len = gt_prefix_len(&bytes[start..tab_pos]);
Expand Down Expand Up @@ -680,8 +692,7 @@ mod tests {
let out = SilentOutput;
let caught = panic::catch_unwind(AssertUnwindSafe(|| {
let mut gw = GenotypeWriter::new(2, tmp.path(), 64 << 20).unwrap();
// One biallelic SNV, two samples. GT field = "0/0\t0/1".
gw.push("22", 15_000_000, "A", "G", "0/0\t0/1", 1, &out).unwrap();
gw.push("22", 15_000_000, "A", "G", "GT\t0/0\t0/1", 1, &out).unwrap();
panic!("simulated worker panic before flush_all");
}));
assert!(caught.is_err(), "expected panic to propagate");
Expand All @@ -692,4 +703,47 @@ mod tests {
let reader = parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder::try_new(f);
assert!(reader.is_ok(), "parquet footer missing: {:?}", reader.err());
}

/// noodles hands us raw samples text that starts with the FORMAT column
/// ("GT" or "GT:AD:DP"). `push` must strip it before parsing per-sample
/// genotypes; otherwise sample[0]'s slot gets the FORMAT bytes, everything
/// shifts by one, and the last sample is dropped.
#[test]
fn push_strips_format_before_mapping_samples() {
let tmp = tempfile::tempdir().unwrap();
let out = SilentOutput;
let mut gw = GenotypeWriter::new(3, tmp.path(), 64 << 20).unwrap();
gw.push("22", 100, "A", "G", "GT\t0/0\t0/1\t1/1", 1, &out)
.unwrap();
assert_eq!(gw.dosages, vec![0.0, 1.0, 2.0]);
}

#[test]
fn push_handles_multi_field_format() {
// FORMAT="GT:AD:DP" is one tab-delimited field; sample values carry
// their own colon separators inside each tab cell.
let tmp = tempfile::tempdir().unwrap();
let out = SilentOutput;
let mut gw = GenotypeWriter::new(2, tmp.path(), 64 << 20).unwrap();
gw.push(
"22",
200,
"A",
"G",
"GT:AD:DP\t0/1:12,3:15\t1/1:0,20:20",
1,
&out,
)
.unwrap();
assert_eq!(gw.dosages, vec![1.0, 2.0]);
}

#[test]
fn push_treats_missing_samples_field_as_all_nan() {
let tmp = tempfile::tempdir().unwrap();
let out = SilentOutput;
let mut gw = GenotypeWriter::new(2, tmp.path(), 64 << 20).unwrap();
gw.push("22", 300, "A", "G", ".", 1, &out).unwrap();
assert!(gw.dosages.iter().all(|d| d.is_nan()));
}
}
Loading