-
Notifications
You must be signed in to change notification settings - Fork 66
Title: Add index serialization to parquet format (--save-index, --load-index, --validate-index) #203
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Title: Add index serialization to parquet format (--save-index, --load-index, --validate-index) #203
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -88,50 +88,98 @@ impl Runner { | |
| pub fn new(parameters: Search, parallel: usize) -> anyhow::Result<Self> { | ||
| let mut parameters = parameters.clone(); | ||
| let start = Instant::now(); | ||
| let fasta = sage_cloudpath::util::read_fasta( | ||
| &parameters.database.fasta, | ||
| &parameters.database.decoy_tag, | ||
| parameters.database.generate_decoys, | ||
| ) | ||
| .with_context(|| { | ||
| format!( | ||
| "Failed to build database from `{}`", | ||
| parameters.database.fasta | ||
|
|
||
| // Check if we should load a pre-built index | ||
| let database = if let Some(ref load_path) = parameters.load_index { | ||
| info!("Loading pre-built index from {}", load_path); | ||
| let path = load_path.parse::<CloudPath>()?; | ||
| let loaded_db = sage_cloudpath::index_parquet::deserialize_index(&path) | ||
| .with_context(|| format!("Failed to load index from `{}`", load_path))?; | ||
|
|
||
| info!( | ||
| "loaded {} fragments, {} peptides in {:#?}", | ||
| loaded_db.fragments.len(), | ||
| loaded_db.peptides.len(), | ||
| start.elapsed() | ||
| ); | ||
|
|
||
| // Optional validation: build from FASTA and compare | ||
| if parameters.validate_index { | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Rather than have a separate |
||
| info!("Validating loaded index against FASTA build..."); | ||
| let fasta = sage_cloudpath::util::read_fasta( | ||
| &parameters.database.fasta, | ||
| &parameters.database.decoy_tag, | ||
| parameters.database.generate_decoys, | ||
| ) | ||
| .with_context(|| { | ||
| format!("Failed to read FASTA from `{}`", parameters.database.fasta) | ||
| })?; | ||
| let built_db = parameters.database.clone().build(fasta); | ||
| sage_cloudpath::index_parquet::validate_index(&built_db, &loaded_db) | ||
| .with_context(|| "Index validation failed")?; | ||
| info!("Index validation passed!"); | ||
| } | ||
|
|
||
| loaded_db | ||
| } else { | ||
| // Build from FASTA as usual | ||
| let fasta = sage_cloudpath::util::read_fasta( | ||
| &parameters.database.fasta, | ||
| &parameters.database.decoy_tag, | ||
| parameters.database.generate_decoys, | ||
| ) | ||
| })?; | ||
|
|
||
| let database = match parameters.database.prefilter { | ||
| false => parameters.database.clone().build(fasta), | ||
| true => { | ||
| parameters | ||
| .database | ||
| .auto_calculate_prefilter_chunk_size(&fasta); | ||
| if parameters.database.prefilter_chunk_size >= fasta.targets.len() { | ||
| parameters.database.clone().build(fasta) | ||
| } else { | ||
| info!( | ||
| "using {} db chunks of size {}", | ||
| (fasta.targets.len() + parameters.database.prefilter_chunk_size - 1) | ||
| / parameters.database.prefilter_chunk_size, | ||
| parameters.database.prefilter_chunk_size, | ||
| ); | ||
| let mini_runner = Self { | ||
| database: IndexedDatabase::default(), | ||
| parameters: parameters.clone(), | ||
| start, | ||
| }; | ||
| let peptides = mini_runner.prefilter_peptides(parallel, fasta); | ||
| parameters.database.clone().build_from_peptides(peptides) | ||
| .with_context(|| { | ||
| format!( | ||
| "Failed to build database from `{}`", | ||
| parameters.database.fasta | ||
| ) | ||
| })?; | ||
|
|
||
| let built_db = match parameters.database.prefilter { | ||
| false => parameters.database.clone().build(fasta), | ||
| true => { | ||
| parameters | ||
| .database | ||
| .auto_calculate_prefilter_chunk_size(&fasta); | ||
| if parameters.database.prefilter_chunk_size >= fasta.targets.len() { | ||
| parameters.database.clone().build(fasta) | ||
| } else { | ||
| info!( | ||
| "using {} db chunks of size {}", | ||
| (fasta.targets.len() + parameters.database.prefilter_chunk_size - 1) | ||
| / parameters.database.prefilter_chunk_size, | ||
| parameters.database.prefilter_chunk_size, | ||
| ); | ||
| let mini_runner = Self { | ||
| database: IndexedDatabase::default(), | ||
| parameters: parameters.clone(), | ||
| start, | ||
| }; | ||
| let peptides = mini_runner.prefilter_peptides(parallel, fasta); | ||
| parameters.database.clone().build_from_peptides(peptides) | ||
| } | ||
| } | ||
| }; | ||
|
|
||
| info!( | ||
| "generated {} fragments, {} peptides in {:#?}", | ||
| built_db.fragments.len(), | ||
| built_db.peptides.len(), | ||
| start.elapsed() | ||
| ); | ||
|
|
||
| // Save index if requested | ||
| if let Some(ref save_path) = parameters.save_index { | ||
| info!("Saving index to {}", save_path); | ||
| let path = save_path.parse::<CloudPath>()?; | ||
| sage_cloudpath::index_parquet::serialize_index(&built_db, &path) | ||
| .with_context(|| format!("Failed to save index to `{}`", save_path))?; | ||
| info!("Index saved successfully"); | ||
| } | ||
|
|
||
| built_db | ||
| }; | ||
|
|
||
| info!( | ||
| "generated {} fragments, {} peptides in {:#?}", | ||
| database.fragments.len(), | ||
| database.peptides.len(), | ||
| (start.elapsed()) | ||
| ); | ||
| Ok(Self { | ||
| database, | ||
| parameters, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think these could be consolidated to a single CLI option. If the file is not present, we generate the database and then serialize it. If the database is present, we use the serialized version