Merged
17 commits
c1fa86b
Replace Beta/proportion likelihood with negative binomial approach
devin-ai-integration[bot] May 30, 2025
dacc412
Fix unused variable warning for total_reads parameter
devin-ai-integration[bot] May 30, 2025
da2d5bc
Remove unused total_reads parameter entirely
devin-ai-integration[bot] May 30, 2025
363c916
Apply cargo fmt formatting fixes
devin-ai-integration[bot] May 30, 2025
0cb0e52
Fix posterior probability pipeline to preserve Bayesian calculations
devin-ai-integration[bot] May 30, 2025
c1a0902
Change peak merging to multiply posterior probabilities
devin-ai-integration[bot] May 30, 2025
30f6403
Fix posterior probability bug in identify_significant_bins
devin-ai-integration[bot] May 30, 2025
94befe7
fmt
jakevc May 30, 2025
aac0bdf
Apply cargo fmt to fix CI formatting requirements
devin-ai-integration[bot] May 30, 2025
5692419
Merge branch 'devin/1748581809-negative-binomial-likelihood' of https…
devin-ai-integration[bot] May 30, 2025
dc7606b
Add debug logging to trace posterior probability calculations
devin-ai-integration[bot] May 30, 2025
80e7355
Fix posterior probability calculation with proper signal vs noise hyp…
devin-ai-integration[bot] May 30, 2025
a1382ab
Apply cargo fmt formatting to bayesian.rs
devin-ai-integration[bot] May 30, 2025
f54cc23
Fix formatting issues - remove extra blank lines for CI
devin-ai-integration[bot] May 30, 2025
b1c09e9
Fix clippy warning - remove unnecessary let binding
devin-ai-integration[bot] May 30, 2025
da2a06a
Implement conservative negative binomial parameter estimation
devin-ai-integration[bot] May 30, 2025
34ffeab
unlog
jakevc May 30, 2025
4 changes: 0 additions & 4 deletions src/bam.rs
@@ -136,10 +136,6 @@ impl BamProcessor {
.iter()
.cloned()
.zip(bin_counts.iter().cloned())
.map(|(mut bin, count)| {
bin.posterior_prob = 1.0;
(bin, count)
})
.collect();

Ok(result)
177 changes: 113 additions & 64 deletions src/bayesian.rs
@@ -2,6 +2,7 @@
use bio::stats::bayesian::model::{Likelihood, Model, Posterior, Prior};
use bio::stats::{LogProb, Prob};
use log::info;
use statrs::distribution::{Discrete, NegativeBinomial};
use statrs::statistics::Statistics;

use crate::bam::GenomicRange;
@@ -18,9 +19,10 @@
}

#[allow(dead_code)]
#[derive(Clone)]
pub struct GenomicPrior {
pub alpha: f64,
pub beta: f64,
pub r: f64, // number of successes parameter
pub p: f64, // success probability parameter
}

impl Prior for GenomicPrior {
@@ -33,54 +35,74 @@
}

impl GenomicPrior {
pub fn from_bin_counts(bin_counts: &[(GenomicRange, usize)], total_reads: usize) -> Self {
let non_zero_counts: Vec<f64> = bin_counts
.iter()
.filter_map(|(_, count)| {
if *count > 0 {
Some(*count as f64)
} else {
None
}
})
.collect();

let mean = non_zero_counts.clone().mean();
let variance = non_zero_counts.variance();

let total_mean = mean / total_reads as f64;
let total_variance = variance / (total_reads * total_reads) as f64;

if total_variance >= total_mean * (1.0 - total_mean) {
return Self {
alpha: 1.0,
beta: 99.0,
};
}
pub fn from_bin_counts(bin_counts: &[(GenomicRange, usize)]) -> Self {
let counts: Vec<f64> = bin_counts.iter().map(|(_, count)| *count as f64).collect();

let alpha_beta_sum = total_mean * (1.0 - total_mean) / total_variance - 1.0;
let alpha = total_mean * alpha_beta_sum;
let beta = (1.0 - total_mean) * alpha_beta_sum;
if counts.is_empty() || counts.len() < 2 {
return Self { r: 2.0, p: 0.3 };
}

Self {
alpha: alpha.max(0.01),
beta: beta.max(0.01),
let mean = counts.clone().mean();
let variance = counts.clone().variance();

// info!(
// "Count statistics: mean={}, variance={}, n={}",
// mean,
// variance,
// counts.len()
// );

if variance > mean && variance.is_finite() && mean > 0.0 {
let r = (mean * mean) / (variance - mean);
let p = mean / variance;

if r.is_finite() && r > 0.0 && p.is_finite() && p > 0.0 && p < 1.0 {
let final_r = r.clamp(1.0, 50.0); // Tighter bounds: min 1.0, max 50.0
let final_p = p.clamp(0.1, 0.8); // Tighter bounds: min 0.1, max 0.8

// info!(
// "Method-of-moments estimation: raw r={}, p={}, clamped r={}, p={}",
// r, p, final_r, final_p
// );

return Self {
r: final_r,
p: final_p,
};
}
}

Self { r: 3.0, p: 0.4 }
}
}

pub struct GenomicLikelihood;
pub struct GenomicLikelihood {
pub r: f64,
pub p: f64,
}

impl Likelihood for GenomicLikelihood {
type Event = GenomicEvent;
type Data = ReadCountData;

fn compute(&self, event: &Self::Event, data: &Self::Data, _payload: &mut ()) -> LogProb {
let observed_proportion = data.observed_count as f64 / event.total_reads as f64;

let likelihood_given_signal = observed_proportion.max(0.5);

LogProb::from(Prob(likelihood_given_signal))
fn compute(&self, _event: &Self::Event, data: &Self::Data, _payload: &mut ()) -> LogProb {
match NegativeBinomial::new(self.r, self.p) {
Ok(nb_dist) => {
let log_likelihood = nb_dist.ln_pmf(data.observed_count as u64);
// info!(
// "NB likelihood: count={}, r={}, p={}, ln_pmf={}",
// data.observed_count, self.r, self.p, log_likelihood
// );
LogProb::from(log_likelihood)
}
Err(e) => {
Check warning on line 98 in src/bayesian.rs (GitHub Actions / test-example): unused variable: `e`
// info!(
// "NegativeBinomial::new failed: r={}, p={}, error={:?}",
// self.r, self.p, e
// );
LogProb::ln_zero()
}
}
}
}

@@ -95,16 +117,27 @@
where
F: FnMut(&Self::BaseEvent, &Self::Data) -> LogProb,
{
// Calculate joint probability for signal hypothesis
let joint_prob_signal = joint_prob(event, data);

let noise_event = GenomicEvent {
bin_count: (data.observed_count as f64 * 0.1) as usize,
total_reads: event.total_reads,
// Calculate signal likelihood using current parameters
let signal_likelihood = joint_prob(event, data);

// output is very sensitive to the noise parameters
let noise_likelihood = match NegativeBinomial::new(0.7, 0.5) {
Ok(nb_dist) => {
let log_likelihood = nb_dist.ln_pmf(data.observed_count as u64);
// info!(
// "Noise likelihood: count={}, ln_pmf={}",
// data.observed_count, log_likelihood
// );
LogProb::from(log_likelihood)
}
Err(_) => LogProb::ln_zero(),
};

// Calculate joint probability for noise hypothesis
let joint_prob_noise = joint_prob(&noise_event, data);
let prior_signal = LogProb::from(Prob(0.3)); // Reduced from 0.5 to be more conservative
let prior_noise = LogProb::from(Prob(0.7)); // Increased from 0.5 to be more conservative

let joint_prob_signal = signal_likelihood + prior_signal;
let joint_prob_noise = noise_likelihood + prior_noise;

// Calculate evidence (marginal likelihood)
let evidence = joint_prob_signal.ln_add_exp(joint_prob_noise);
@@ -121,11 +154,8 @@

impl BayesianModel {
pub fn new(significance_threshold: f64, min_reads: u32) -> Self {
let likelihood = GenomicLikelihood;
let prior = GenomicPrior {
alpha: 1.0,
beta: 99.0,
};
let likelihood = GenomicLikelihood { r: 1.0, p: 0.5 };
let prior = GenomicPrior { r: 1.0, p: 0.5 };
let posterior = GenomicPosterior;
let model = Model::new(likelihood, prior, posterior);

@@ -151,10 +181,18 @@
) -> Result<Vec<GenomicRange>> {
info!("Applying rust-bio Bayesian model to identify significant bins");

let prior = GenomicPrior::from_bin_counts(bin_counts, total_reads);
*self.model.prior_mut() = prior;
let prior = GenomicPrior::from_bin_counts(bin_counts);
// info!(
// "Estimated negative binomial parameters: r={}, p={} (fallback used: {})",
// prior.r,
// prior.p,
// prior.r == 3.0 && prior.p == 0.4
// );
*self.model.prior_mut() = prior.clone();

self.model.likelihood_mut().r = prior.r;
self.model.likelihood_mut().p = prior.p;

let mut significant_bins = Vec::new();
let mut posterior_probs = Vec::new();

for (bin, count) in bin_counts {
@@ -171,25 +209,32 @@
observed_count: *count,
};

let universe = vec![event.clone()];
let model_instance = self.model.compute(universe, &data);
let mut joint_prob_fn = |event: &GenomicEvent, data: &ReadCountData| -> LogProb {
let likelihood = self.model.likelihood().compute(event, data, &mut ());
let prior = self.model.prior().compute(event);
likelihood + prior
};

let posterior_log = model_instance
.posterior(&event)
.unwrap_or(LogProb::ln_zero());
let posterior_log = self
.model
.posterior()
.compute(&event, &data, &mut joint_prob_fn);
let posterior_prob = (*Prob::from(posterior_log)).clamp(0.0, 1.0);

// info!(
// "Bin {}:{}-{} count={} posterior_log={:?} posterior_prob={}",
// bin.chrom, bin.start, bin.end, count, posterior_log, posterior_prob
// );
posterior_probs.push((bin.clone(), posterior_prob));
}

let significant =
self.apply_posterior_threshold(posterior_probs, self.significance_threshold)?;

for (bin, _) in significant {
significant_bins.push(bin);
}
let significant_bins: Vec<GenomicRange> =
significant.into_iter().map(|(bin, _)| bin).collect();

info!("Found {} significant bins", significant_bins.len());
// info!("Found {} significant bins", significant_bins.len());

Ok(significant_bins)
}
@@ -205,6 +250,10 @@
if posterior_prob >= threshold {
let mut bin_clone = bin.clone();
bin_clone.posterior_prob = posterior_prob;
info!(
"Setting bin {}:{}-{} posterior_prob to: {}",
bin_clone.chrom, bin_clone.start, bin_clone.end, bin_clone.posterior_prob
);
significant.push((bin_clone, posterior_prob));
}
}
4 changes: 2 additions & 2 deletions src/genome.rs
@@ -115,7 +115,7 @@ impl Genome {
chrom: chromosome.to_string(),
start,
end,
posterior_prob: 1.0, // Initialize posterior probability to 1.0
posterior_prob: 0.0, // Will be set by Bayesian model
});
start += step;
}
@@ -144,7 +144,7 @@ impl Genome {
chrom: chrom.clone(),
start,
end,
posterior_prob: 1.0, // Initialize posterior probability to 1.0
posterior_prob: 0.0, // Will be set by Bayesian model
});
start += step;
}
21 changes: 16 additions & 5 deletions src/peak_caller.rs
@@ -4,7 +4,7 @@ use crate::cli::Cli;
use crate::genome::Genome;
use anyhow::Result;
use bio::io::bed::{Record, Writer};
use bio::stats::{LogProb, Prob};

use log::info;
use rayon::prelude::*;
use std::collections::HashMap;
@@ -114,20 +114,21 @@ impl PeakCaller {
}

let mut current_peak = sorted_bins[0].clone();
let mut posterior_probs = vec![current_peak.posterior_prob];

for bin in sorted_bins.iter().skip(1) {
if bin.start <= current_peak.end + max_distance {
current_peak.end = bin.end.max(current_peak.end);

let current_log = LogProb::from(Prob(current_peak.posterior_prob));
let bin_log = LogProb::from(Prob(bin.posterior_prob));
current_peak.posterior_prob = *Prob::from(current_log + bin_log);
posterior_probs.push(bin.posterior_prob);
} else {
current_peak.posterior_prob = median(&mut posterior_probs);
result.push(current_peak.clone());
current_peak = bin.clone();
posterior_probs = vec![current_peak.posterior_prob];
}
}

current_peak.posterior_prob = median(&mut posterior_probs);
result.push(current_peak);
result
})
@@ -174,3 +175,13 @@ impl Peaks {
self.ranges.len()
}
}

fn median(values: &mut Vec<f64>) -> f64 {
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let len = values.len();
if len % 2 == 0 {
(values[len / 2 - 1] + values[len / 2]) / 2.0
} else {
values[len / 2]
}
}