Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Speedup on the generation of databases when a large number of peptides are redundant.
- Initial support for searching diaPASEF data
- `override_precursor_charge` setting that forces multiple charge states to be searched
- Index serialization to parquet format (`--save-index`, `--load-index`, `--validate-index`)
### Breaking Changes
- `precursor_ppm` field reports the signed (non-absolute) average mass error, rather than the absolute average mass error.
- Don't deisotope reporter ion regions if MS2-based TMT/iTRAQ is used
Expand Down
53 changes: 51 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ If you use Sage in a scientific publication, please cite the following paper:
- Configuration by [JSON file](https://sage-docs.vercel.app/docs/configuration#file)
- Built-in support for reading gzipped-mzML files
- Support for reading/writing directly from [AWS S3](https://sage-docs.vercel.app/docs/configuration/aws)
- Index serialization to parquet for fast reloading (`--save-index`, `--load-index`)

## Interoperability

Expand Down
30 changes: 30 additions & 0 deletions crates/sage-cli/src/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ pub struct Search {
pub annotate_matches: bool,

pub score_type: ScoreType,

#[serde(skip_serializing)]
pub save_index: Option<String>,

#[serde(skip_serializing)]
pub load_index: Option<String>,

#[serde(skip_serializing)]
pub validate_index: bool,
}

#[derive(Deserialize)]
Expand Down Expand Up @@ -76,6 +85,13 @@ pub struct Input {
pub write_pin: Option<bool>,
pub write_report: Option<bool>,
pub score_type: Option<ScoreType>,

#[serde(skip)]
pub save_index: Option<String>,
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think these could be consolidated to a single CLI option. If the file is not present, we generate the database and then serialize it. If the database is present, we use the serialized version

#[serde(skip)]
pub load_index: Option<String>,
#[serde(skip)]
pub validate_index: Option<bool>,
}

#[derive(Serialize, Deserialize, Debug)]
Expand Down Expand Up @@ -213,6 +229,17 @@ impl Input {
input.annotate_matches = Some(annotate_matches);
}

// Index serialization options
if let Some(save_index) = matches.get_one::<String>("save-index") {
input.save_index = Some(save_index.clone());
}
if let Some(load_index) = matches.get_one::<String>("load-index") {
input.load_index = Some(load_index.clone());
}
if let Some(validate_index) = matches.get_one::<bool>("validate-index").copied() {
input.validate_index = Some(validate_index);
}

// avoid a later panic if these parameters are not set (but doesn't check whether the files exist)

ensure!(
Expand Down Expand Up @@ -343,6 +370,9 @@ impl Input {
bruker_config: self.bruker_config.unwrap_or_default(),
write_report: self.write_report.unwrap_or(false),
score_type,
save_index: self.save_index,
load_index: self.load_index,
validate_index: self.validate_index.unwrap_or(false),
})
}
}
Expand Down
20 changes: 20 additions & 0 deletions crates/sage-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,26 @@ fn main() -> anyhow::Result<()> {
.action(clap::ArgAction::SetFalse)
.help("Disable sending telemetry data"),
)
.arg(
Arg::new("save-index")
.long("save-index")
.value_parser(clap::builder::NonEmptyStringValueParser::new())
.help("Save the built index to a parquet directory for reuse")
.value_hint(ValueHint::DirPath),
)
.arg(
Arg::new("load-index")
.long("load-index")
.value_parser(clap::builder::NonEmptyStringValueParser::new())
.help("Load a pre-built index from parquet directory (skips FASTA processing)")
.value_hint(ValueHint::DirPath),
)
.arg(
Arg::new("validate-index")
.long("validate-index")
.action(clap::ArgAction::SetTrue)
.help("Validate loaded index matches what would be built from FASTA"),
)
.help_template(
"{usage-heading} {usage}\n\n\
{about-with-newline}\n\
Expand Down
126 changes: 87 additions & 39 deletions crates/sage-cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,50 +88,98 @@ impl Runner {
pub fn new(parameters: Search, parallel: usize) -> anyhow::Result<Self> {
let mut parameters = parameters.clone();
let start = Instant::now();
let fasta = sage_cloudpath::util::read_fasta(
&parameters.database.fasta,
&parameters.database.decoy_tag,
parameters.database.generate_decoys,
)
.with_context(|| {
format!(
"Failed to build database from `{}`",
parameters.database.fasta

// Check if we should load a pre-built index
let database = if let Some(ref load_path) = parameters.load_index {
info!("Loading pre-built index from {}", load_path);
let path = load_path.parse::<CloudPath>()?;
let loaded_db = sage_cloudpath::index_parquet::deserialize_index(&path)
.with_context(|| format!("Failed to load index from `{}`", load_path))?;

info!(
"loaded {} fragments, {} peptides in {:#?}",
loaded_db.fragments.len(),
loaded_db.peptides.len(),
start.elapsed()
);

// Optional validation: build from FASTA and compare
if parameters.validate_index {
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than have a separate validate_index parameter, it seems like the most reasonable thing to do is always validate. I would suggest that instead of regenerating the index, the serialized index should store some kind of hash/checksum of the fasta file along with the enzymatic digestion parameters.

info!("Validating loaded index against FASTA build...");
let fasta = sage_cloudpath::util::read_fasta(
&parameters.database.fasta,
&parameters.database.decoy_tag,
parameters.database.generate_decoys,
)
.with_context(|| {
format!("Failed to read FASTA from `{}`", parameters.database.fasta)
})?;
let built_db = parameters.database.clone().build(fasta);
sage_cloudpath::index_parquet::validate_index(&built_db, &loaded_db)
.with_context(|| "Index validation failed")?;
info!("Index validation passed!");
}

loaded_db
} else {
// Build from FASTA as usual
let fasta = sage_cloudpath::util::read_fasta(
&parameters.database.fasta,
&parameters.database.decoy_tag,
parameters.database.generate_decoys,
)
})?;

let database = match parameters.database.prefilter {
false => parameters.database.clone().build(fasta),
true => {
parameters
.database
.auto_calculate_prefilter_chunk_size(&fasta);
if parameters.database.prefilter_chunk_size >= fasta.targets.len() {
parameters.database.clone().build(fasta)
} else {
info!(
"using {} db chunks of size {}",
(fasta.targets.len() + parameters.database.prefilter_chunk_size - 1)
/ parameters.database.prefilter_chunk_size,
parameters.database.prefilter_chunk_size,
);
let mini_runner = Self {
database: IndexedDatabase::default(),
parameters: parameters.clone(),
start,
};
let peptides = mini_runner.prefilter_peptides(parallel, fasta);
parameters.database.clone().build_from_peptides(peptides)
.with_context(|| {
format!(
"Failed to build database from `{}`",
parameters.database.fasta
)
})?;

let built_db = match parameters.database.prefilter {
false => parameters.database.clone().build(fasta),
true => {
parameters
.database
.auto_calculate_prefilter_chunk_size(&fasta);
if parameters.database.prefilter_chunk_size >= fasta.targets.len() {
parameters.database.clone().build(fasta)
} else {
info!(
"using {} db chunks of size {}",
(fasta.targets.len() + parameters.database.prefilter_chunk_size - 1)
/ parameters.database.prefilter_chunk_size,
parameters.database.prefilter_chunk_size,
);
let mini_runner = Self {
database: IndexedDatabase::default(),
parameters: parameters.clone(),
start,
};
let peptides = mini_runner.prefilter_peptides(parallel, fasta);
parameters.database.clone().build_from_peptides(peptides)
}
}
};

info!(
"generated {} fragments, {} peptides in {:#?}",
built_db.fragments.len(),
built_db.peptides.len(),
start.elapsed()
);

// Save index if requested
if let Some(ref save_path) = parameters.save_index {
info!("Saving index to {}", save_path);
let path = save_path.parse::<CloudPath>()?;
sage_cloudpath::index_parquet::serialize_index(&built_db, &path)
.with_context(|| format!("Failed to save index to `{}`", save_path))?;
info!("Index saved successfully");
}

built_db
};

info!(
"generated {} fragments, {} peptides in {:#?}",
database.fragments.len(),
database.peptides.len(),
(start.elapsed())
);
Ok(Self {
database,
parameters,
Expand Down
3 changes: 3 additions & 0 deletions crates/sage-cloudpath/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,6 @@ serde_json = "1.0"

sage-core = { path = "../sage" }
parquet = { version = "50.0.0", optional = true, default-features = false, features = ["zstd"] }

[dev-dependencies]
tempfile = "3"
Loading