diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h new file mode 100755 index 00000000000..c606f9278fe --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h @@ -0,0 +1,156 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace OpenMS +{ + //class MSExperiment; + + /** @brief Creating a two-level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting + * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra + * (implementation inspired by https://lazear.github.io/sage/) + * @ingroup ID + */ +class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler +{ + public: + + /** @brief Storing found candiates from search + */ + struct Candidate + { + size_t peptide_start; ///< Start position of peptide in protein + size_t peptide_end; ///< End position of peptide in protein + size_t protein_index; ///< Index to Entry of std::vector + bool is_modified_; ///< true if candidate is modified + size_t modification_index_; ///< index to modified version of this peptide + }; + + /** @brief Storing vector of found candiates from search with the Index of the MSSpectrum in the MSExperiment + */ + struct CandidatesWithIndex + { + std::vector candidates; ///< Vector of found candidates + size_t spectrum_index; ///< Index to the MSSpectrum in the MSExperiment + CandidatesWithIndex(std::vector&& cand, size_t spectrum_i): candidates(std::move(cand)), spectrum_index(spectrum_i){} + }; + + + /** @brief Sets default parameters + */ + FragmentIndex(); + + /** @brief Builds up the Search Datastructure + * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on + */ + void build(const std::vector& entries); + + /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + * @param experiment Input MSExperiment containing of MS2 Spectra (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidates with the index of MSSpectrum in experiment + */ + void search(MSExperiment& experiment, std::vector& candidates) const; + + /** @brief Searches Peaks of the MSSpectrum in the Database + * @param spectrum Input MS2 Spectrum (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidatest + */ + void search(MSSpectrum& spectrum, std::vector& candidates) const; + + protected: + + void updateMembers_() override; + + //Saving b_y_ion after creating theoretical spectrum with index to precursor-peptide + struct Fragment_ + { + size_t peptide_index_; + double fragment_mz_; + Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} + }; + + //Saving first and last index of precursor in FASTA-Entry, index to protein and the MonoWeight after digesting + struct Peptide_ + { + size_t peptide_begin_; + size_t peptide_end_; + size_t protein_index_; + double peptide_mz_; + bool is_modified_; + size_t modification_index_; + }; + + /// in-silico digest protein database + std::vector generate_peptides_(const std::vector& entries) const; + /// Merges presorted Chunks of Peptide-Fragments inplace + void fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const; + ///generates sorted vector with all theoretical Fragments for all theoretical Peptides + std::vector generate_fragments_(const std::vector& entries) const; + + bool is_build_; + std::string digestor_enzyme_; + size_t missed_cleavages_; ///< number of missed cleavages + double peptide_min_mass_; + double peptide_max_mass_; + size_t peptide_min_length_; + size_t peptide_max_length_; + double fragment_min_mz_; + double fragment_max_mz_; + size_t fragment_min_charge_; + size_t fragment_max_charge_; + size_t bucketsize_; ///< number of fragments per outer node + double precursor_mz_tolerance_; + std::string precursor_mz_tolerance_unit_; + double fragment_mz_tolerance_; + std::string fragment_mz_tolerance_unit_; + StringList modifications_fixed_; + StringList modifications_variable_; + size_t max_variable_mods_per_peptide_; + size_t max_missed_peaks_; + std::vector all_peptides_; + std::vector bucket_frags_mz_; ///< Minimum fragment-m/z for each other node + std::vector all_fragments_; +}; + +} \ No newline at end of file diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h new file mode 100755 index 00000000000..3a0be9a069f --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -0,0 +1,147 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace OpenMS +{ + //class MSExperiment; + + /** @brief Creating a two-level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting + * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra + * (implementation inspired by https://lazear.github.io/sage/) + * @ingroup ID + */ +class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler +{ + public: + + /** @brief Storing found candiates from search + */ + struct Candidate + { + const AASequence* sequence; ///< Pointer to AASequence of candidate + size_t protein_index; ///< Index to Entry of std::vector + Candidate(const AASequence* seq, size_t pi) : sequence(seq), protein_index(pi){} + }; + + /** @brief Storing vector of found candiates from search with the Index of the MSSpectrum in the MSExperiment + */ + struct CandidatesWithIndex + { + std::vector candidates; ///< Vector of found candidates + size_t spectrum_index; ///< Index to the MSSpectrum in the MSExperiment + CandidatesWithIndex(const std::vector& cand, size_t spectrum_i): candidates(cand), spectrum_index(spectrum_i){} + }; + + /** @brief Builds up the Search Datastructure + * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on + */ + SearchDatabase(const std::vector& entries); + + /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + * @param experiment Input MSExperiment containing of MS2 Spectra (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidates with the index of MSSpectrum in experiment + */ + void search(MSExperiment& experiment, std::vector& candidates) const; + + /** @brief Searches Peaks of the MSSpectrum in the Database + * @param spectrum Input MS2 Spectrum (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidatest + */ + void search(MSSpectrum& spectrum, std::vector& candidates) const; + + protected: + + void updateMembers_() override; + + //Saving b_y_ion after creating theoretical spectrum with index to precursor-peptide + struct Fragment_ + { + size_t peptide_index_; + double fragment_mz_; + Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} + }; + + //Saving precursors AASequence (, index to protein and the MonoWeight) after digesting + struct Peptide_ + { + AASequence sequence_; + size_t protein_index_; + double peptide_mz_; + Peptide_(const Peptide_&) = default; + Peptide_(Peptide_&&) = default; + Peptide_(const AASequence& pep, size_t index, double mass):sequence_(pep), protein_index_(index), peptide_mz_(mass){} + Peptide_& operator=(Peptide_&&) = default; + Peptide_& operator=(const Peptide_&) = default; + }; + + /// generates theoretical Peptides from all Proteins in fasta-File + std::vector generate_peptides_(const std::vector& entries) const; + /// Merges presorted Chunks of Peptide-Fragments inplace + void fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const; + ///generates sorted vector with all theoretical Fragments for all theoretical Peptides + std::vector generate_fragments_() const; + + std::string digestor_enzyme_; + size_t missed_cleavages_; ///< number of missed cleavages + double peptide_min_mass_; + double peptide_max_mass_; + size_t peptide_min_length_; + size_t peptide_max_length_; + double fragment_min_mz_; + double fragment_max_mz_; + size_t bucketsize_; ///< number of fragments per outer node + double precursor_mz_tolerance_; + std::string precursor_mz_tolerance_unit_; + double fragment_mz_tolerance_; + std::string fragment_mz_tolerance_unit_; + StringList modifications_fixed_; + StringList modifications_variable_; + size_t max_variable_mods_per_peptide_; + std::vector all_peptides_{}; + std::vector bucket_frags_mz_{}; ///< Minimum fragment-m/z for each other node + std::vector all_fragments_{}; +}; + +} \ No newline at end of file diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake old mode 100644 new mode 100755 index 5f8165029a2..cdb62e152c7 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -21,6 +21,7 @@ ConsensusMapMergerAlgorithm.h FalseDiscoveryRate.h FIAMSDataProcessor.h FIAMSScheduler.h +FragmentIndex.h HiddenMarkovModel.h IDBoostGraph.h IDDecoyProbability.h diff --git a/src/openms/include/OpenMS/CONCEPT/Constants.h b/src/openms/include/OpenMS/CONCEPT/Constants.h old mode 100644 new mode 100755 index d5b5ca0be30..26d34f86962 --- a/src/openms/include/OpenMS/CONCEPT/Constants.h +++ b/src/openms/include/OpenMS/CONCEPT/Constants.h @@ -575,6 +575,16 @@ namespace OpenMS String */ inline const std::string MSM_SUM_FORMULA = "Sum_Formula"; + + /** User parameter for the unit type of the m/z tolerance. (Required for SearchDatabase) + String + */ + inline const std::string UNIT_DA = "Da"; + + /** User parameter for the unit type of the m/z tolerance. (Required for SearchDatabase) + String + */ + inline const std::string UNIT_PPM = "ppm"; } //@} diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp new file mode 100755 index 00000000000..8068bb01707 --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -0,0 +1,388 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace std; + + +namespace OpenMS +{ + +using namespace Constants::UserParam; +using namespace Math; + +std::vector FragmentIndex::generate_peptides_(const std::vector& entries) const +{ + vector all_peptides; + + ModifiedPeptideGenerator::MapToResidueType fixed_modifications = ModifiedPeptideGenerator::getModifications(modifications_fixed_); + ModifiedPeptideGenerator::MapToResidueType variable_modifications = ModifiedPeptideGenerator::getModifications(modifications_variable_); + + size_t skipped_peptides = 0; + + #pragma omp parallel + { + ProteaseDigestion digestor; + digestor.setEnzyme(digestor_enzyme_); + digestor.setMissedCleavages(missed_cleavages_); + vector all_peptides_pvt; + #pragma omp for nowait + for (size_t i = 0; i < entries.size(); ++i) + { + vector> peptides; + digestor.digestUnmodified(StringView(entries[i].sequence), peptides, peptide_min_length_, peptide_max_length_); + + for (const auto& pep : peptides) + { + if (entries[i].sequence.substr(pep.first, pep.second).find('X') != string::npos) // filtering peptide with unknown AA, can't calculate MonoWeight + { + #pragma omp atomic + ++skipped_peptides; + + continue; + } + + if(!(modifications_fixed_.empty() && modifications_variable_.empty())) + { + vector modified_peptides; + + AASequence modified_pep = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)); + + ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); + ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); + + for (size_t j = 0; j < modified_peptides.size(); ++j) + { + double mod_seq_mz = modified_peptides[j].getMonoWeight(); + + if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range + + all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz, true, j}); + } + } + double seq_mz = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)).getMonoWeight(); + + if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range + + all_peptides_pvt.push_back({pep.first, pep.second, i, seq_mz, false, ULONG_MAX}); + + } + } + #pragma omp critical + all_peptides.insert(all_peptides.end(), all_peptides_pvt.begin(), all_peptides_pvt.end()); + } + if (skipped_peptides > 0) OPENMS_LOG_WARN << skipped_peptides << " peptides skipped due to unkown AA \n"; + return all_peptides; +} + +void FragmentIndex::fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const +{ + if (last - first > 1) + { + int mid = first + (last-first) / 2; + #pragma omp parallel sections + { + #pragma omp section + FragmentIndex::fragment_merge_(first, mid, chunks, input); + #pragma omp section + FragmentIndex::fragment_merge_(mid, last, chunks, input); + } + std::inplace_merge(input.begin() + chunks[first], input.begin() + chunks[mid], input.begin() + chunks[last], + [&](const FragmentIndex::Fragment_& l, const FragmentIndex::Fragment_& r)-> bool + { + return (tie(l.fragment_mz_, all_peptides_[l.peptide_index_].peptide_mz_) < + tie(r.fragment_mz_, all_peptides_[r.peptide_index_].peptide_mz_)); + }); + } +} + +std::vector FragmentIndex::generate_fragments_(const std::vector& entries) const +{ + TheoreticalSpectrumGenerator tsg; + PeakSpectrum b_y_ions; + std::vector all_frags; + std::vector chunk_start = {0}; //for fragment_merge need start and end of presorted chunks + + for (size_t i = 0; i < all_peptides_.size(); ++i) + { + tsg.getSpectrum(b_y_ions, AASequence::fromString(entries[all_peptides_[i].protein_index_].sequence.substr(all_peptides_[i].peptide_begin_, all_peptides_[i].peptide_end_)), + fragment_min_charge_, fragment_max_charge_); + for (const auto& frag : b_y_ions) + { + if (!contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; + all_frags.emplace_back(i, frag); + } + chunk_start.emplace_back(all_frags.size()); + b_y_ions.clear(true); + } + + FragmentIndex::fragment_merge_(0, chunk_start.size()-1, chunk_start, all_frags); + + return all_frags; +} + +FragmentIndex::FragmentIndex() : DefaultParamHandler("FragmentIndex") +{ + vector all_enzymes; + ProteaseDB::getInstance()->getAllNames(all_enzymes); + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + all_mods.push_back({}); + vector tolerance_units{UNIT_DA, UNIT_PPM}; + defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); + defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); + defaults_.setValue("missed_cleavages", 0, "Missed cleavages for digestion"); + defaults_.setValue("peptide_min_mass", 500, "Minimal peptide mass for database"); + defaults_.setValue("peptide_max_mass", 5000, "Maximal peptide mass for database"); + defaults_.setValue("peptide_min_length", 5, "Minimal peptide length for database"); + defaults_.setValue("peptide_max_length", 50, "Maximal peptide length for database"); + defaults_.setValue("fragment_min_mz", 150, "Minimal fragment mz for database"); + defaults_.setValue("fragment_max_mz", 2000, "Maximal fragment mz for database"); + defaults_.setValue("fragment_min_charge", 1, "Minimal fragment charge for generation of theorethical Spectrum"); + defaults_.setValue("fragment_max_charge", 1, "Maximal fragment charge for generation of theorethical Spectrum"); + defaults_.setValue("precursor_mz_tolerance", 2.0, "Tolerance for precursor-m/z in search"); + defaults_.setValue("fragment_mz_tolerance", 0.05, "Tolerance for fragment-m/z in search"); + defaults_.setValue("precursor_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for precursor-m/z"); + defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); + defaults_.setValue("fragment_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for fragment-m/z"); + defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); + defaults_.setValue("max_missed_peaks", 5, "If this number of the highest peaks in a spectrum is not found, the spectrum gets skipped"); + defaults_.setValue("modifications_fixed", std::vector{"Carbamidomethyl (C)"}, "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)'"); + defaults_.setValidStrings("modifications_fixed", ListUtils::create(all_mods)); + defaults_.setValue("modifications_variable", std::vector{"Oxidation (M)"}, "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Oxidation (M)'"); + defaults_.setValidStrings("modifications_variable", ListUtils::create(all_mods)); + defaults_.setValue("max_variable_mods_per_peptide", 2, "Maximum number of residues carrying a variable modification per candidate peptide"); + + defaultsToParam_(); + + is_build_ = false; +} + +void FragmentIndex::build(const std::vector& entries) +{ + if (entries.empty()) return; + + all_peptides_ = generate_peptides_(entries); + + all_fragments_ = generate_fragments_(entries); + + bucketsize_ = size_t(sqrt(all_fragments_.size())); //calculating bucketsize for balanced tree (performance) + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + bucket_frags_mz_.emplace_back(all_fragments_[i].fragment_mz_); //store lower m/z border of buckets + } + // sorting the fragments of each bucket by the precursor-m/z + #pragma omp parallel for + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_start = all_fragments_.begin() + i; + auto bucket_end = bucket_start + min(bucketsize_, size_t(all_fragments_.end() - bucket_start)); // Last bucket may be smaller then bucketsize -> end of bucket is all_fragments_.end() + + sort(bucket_start, bucket_end, + [&](const FragmentIndex::Fragment_& l, const FragmentIndex::Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + + is_build_ = true; +} + +void FragmentIndex::updateMembers_() +{ + digestor_enzyme_ = param_.getValue("digestor_enzyme").toString(); + missed_cleavages_ = param_.getValue("missed_cleavages"); + peptide_min_mass_ = param_.getValue("peptide_min_mass"); + peptide_max_mass_ = param_.getValue("peptide_max_mass"); + peptide_min_length_ = param_.getValue("peptide_min_length"); + peptide_max_length_ = param_.getValue("peptide_max_length"); + fragment_min_mz_ = param_.getValue("fragment_min_mz"); + fragment_max_mz_ = param_.getValue("fragment_max_mz"); + fragment_min_charge_ = param_.getValue("fragment_min_charge"); + fragment_max_charge_ = param_.getValue("fragment_max_charge"); + precursor_mz_tolerance_ = param_.getValue("precursor_mz_tolerance"); + fragment_mz_tolerance_ = param_.getValue("fragment_mz_tolerance"); + precursor_mz_tolerance_unit_ = param_.getValue("precursor_mz_tolerance_unit").toString(); + fragment_mz_tolerance_unit_ = param_.getValue("fragment_mz_tolerance_unit").toString(); + max_missed_peaks_ = param_.getValue("max_missed_peaks"); + modifications_fixed_ = ListUtils::toStringList(param_.getValue("modifications_fixed")); + modifications_variable_ = ListUtils::toStringList(param_.getValue("modifications_variable")); + max_variable_mods_per_peptide_ = param_.getValue("max_variable_mods_per_peptide"); +} + +void FragmentIndex::search(MSSpectrum& spectrum, std::vector& candidates) const +{ + if(!is_build_) + { + OPENMS_LOG_WARN << "FragmentIndex not yet build \n"; + return; + } + candidates.clear(); + if (spectrum.empty() || (spectrum.getMSLevel() != 2)) return; + + unordered_set index_hash; // saving every candidate only once + + const std::vector& precursors = spectrum.getPrecursors(); + + if (precursors.size() != 1) + { + OPENMS_LOG_WARN << "Number of precursors does not match MS level \n"; + return; + } + + double prec_mz = precursors[0].getUnchargedMass(); + + spectrum.sortByIntensity(true); + size_t count_found = 0; + size_t count_rounds = 0; + + auto prec_unit_da = precursor_mz_tolerance_unit_ == UNIT_DA; + auto frag_unit_da = fragment_mz_tolerance_unit_ == UNIT_DA; + + for (const auto& peak : spectrum) + { + if (count_found == 0 && count_rounds == max_missed_peaks_) return; // if top x peaks with the highest intensity don't match, skip whole spectrum + + double new_frag_mz_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, peak.getMZ()); + + auto itr_lower = lower_bound(bucket_frags_mz_.begin(), bucket_frags_mz_.end(), peak.getMZ() - new_frag_mz_tolerance); + + size_t index_lower = distance(bucket_frags_mz_.begin(), itr_lower); + + if (index_lower != 0) index_lower--; // lower bound returns one too high (because searching the minimum of each bucket) + + while (bucket_frags_mz_[index_lower] <= (peak.getMZ() + new_frag_mz_tolerance) && index_lower < bucket_frags_mz_.size()) + { + auto bucket_start = all_fragments_.begin() + (index_lower * bucketsize_); + auto bucket_end = bucket_start + min(bucketsize_, size_t(all_fragments_.end() - bucket_start)); // Last bucket may be smaller then bucketsize -> end of bucket is all_fragments_.end() + + double new_prec_mz_tolerance = prec_unit_da ? precursor_mz_tolerance_ : ppmToMassAbs(precursor_mz_tolerance_, prec_mz); + + auto itr_lower_inner = lower_bound(bucket_start, bucket_end, prec_mz - new_prec_mz_tolerance, + [&](const FragmentIndex::Fragment_& r, double l) { + return (all_peptides_[r.peptide_index_].peptide_mz_ < l); + }); + + while (all_peptides_[(*itr_lower_inner).peptide_index_].peptide_mz_ <= (prec_mz + new_prec_mz_tolerance) && itr_lower_inner != bucket_end && itr_lower_inner != all_fragments_.end()) + { + double theoretical_peak_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, (*itr_lower_inner).fragment_mz_); + // filter by matching fragment-m/z + if (!contains(peak.getMZ(), (*itr_lower_inner).fragment_mz_ - theoretical_peak_tolerance, (*itr_lower_inner).fragment_mz_ + theoretical_peak_tolerance)) + { + ++itr_lower_inner; + continue; + } + index_hash.insert((*itr_lower_inner).peptide_index_); + ++itr_lower_inner; + count_found++; + } + index_lower++; + } + count_rounds++; + } + + for (size_t j : index_hash) + { + candidates.push_back( + { + all_peptides_[j].peptide_begin_, + all_peptides_[j].peptide_end_, + all_peptides_[j].protein_index_, + all_peptides_[j].is_modified_, + all_peptides_[j].modification_index_ + }); + } +} + +void FragmentIndex::search(MSExperiment& experiment, std::vector& candidates) const +{ + if(!is_build_) + { + OPENMS_LOG_WARN << "FragmentIndex not yet build \n"; + return; + } + candidates.clear(); + if (experiment.empty()) return; + + #pragma omp parallel + { + #pragma omp for + for (size_t i = 0; i < experiment.size(); ++i) + { + if (experiment[i].empty()) continue; + + vector temp_cand; + + FragmentIndex::search(experiment[i], temp_cand); + + #pragma omp critical + { + candidates.emplace_back(move(temp_cand), i); + } + } + } +} + +} // end namespace OpenMS + + + + + diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp new file mode 100755 index 00000000000..e6a05683645 --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -0,0 +1,336 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + + +namespace OpenMS +{ + +using namespace Constants::UserParam; +using namespace Math; + +std::vector SearchDatabase::generate_peptides_(const std::vector& entries) const +{ + vector all_peptides; + + ModifiedPeptideGenerator::MapToResidueType fixed_modifications = ModifiedPeptideGenerator::getModifications(modifications_fixed_); + ModifiedPeptideGenerator::MapToResidueType variable_modifications = ModifiedPeptideGenerator::getModifications(modifications_variable_); + + #pragma omp parallel + { + ProteaseDigestion digestor; + digestor.setEnzyme(digestor_enzyme_); + digestor.setMissedCleavages(missed_cleavages_); + vector all_peptides_pvt; + #pragma omp for nowait + for (size_t i = 0; i < entries.size(); i++) + { + vector peptides; + digestor.digest(AASequence::fromString(entries[i].sequence), peptides, peptide_min_length_, peptide_max_length_); + + for (const auto& pep : peptides) + { + if (pep.toString().find('X') != string::npos) continue; // filtering peptide with unknown AA, can't calculate MonoWeight + + vector modified_peptides; + + AASequence modified_pep = pep; + + ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); + ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); + + for (const auto & mod_pep : modified_peptides) + { + double mod_seq_mz = mod_pep.getMonoWeight(); + + if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + + all_peptides_pvt.emplace_back(mod_pep, i, mod_seq_mz); + } + + double seq_mz = pep.getMonoWeight(); + + if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + + all_peptides_pvt.emplace_back(pep, i, seq_mz); + + } + } + #pragma omp critical (add_critical) + all_peptides.insert(all_peptides.end(), all_peptides_pvt.begin(), all_peptides_pvt.end()); + } + return all_peptides; +} + +void SearchDatabase::fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const +{ + if (last - first > 1) + { + int mid = first + (last-first) / 2; + #pragma omp parallel sections + { + #pragma omp section + SearchDatabase::fragment_merge_(first, mid, chunks, input); + #pragma omp section + SearchDatabase::fragment_merge_(mid, last, chunks, input); + } + std::inplace_merge(input.begin() + chunks[first], input.begin() + chunks[mid], input.begin() + chunks[last], + [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r)-> bool + {return (tie(l.fragment_mz_, all_peptides_[l.peptide_index_].peptide_mz_) < tie(r.fragment_mz_, all_peptides_[r.peptide_index_].peptide_mz_));}); + } +} + +std::vector SearchDatabase::generate_fragments_() const +{ + TheoreticalSpectrumGenerator tsg; + PeakSpectrum b_y_ions; + std::vector all_frags; + std::vector chunk_start = {0}; //for fragment_merge need start and end of presorted chunks + + for (size_t i = 0; i < all_peptides_.size(); i++) + { + tsg.getSpectrum(b_y_ions, all_peptides_[i].sequence_, 1, 1); + for (const auto& frag : b_y_ions) + { + if (!contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; + all_frags.emplace_back(i, frag); + } + chunk_start.emplace_back(all_frags.size()); + b_y_ions.clear(true); + } + + SearchDatabase::fragment_merge_(0, chunk_start.size()-1, chunk_start, all_frags); + + return all_frags; +} + +SearchDatabase::SearchDatabase(const std::vector& entries) : DefaultParamHandler("SearchDatabase") +{ + vector all_enzymes; + ProteaseDB::getInstance()->getAllNames(all_enzymes); + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + vector tolerance_units{UNIT_DA, UNIT_PPM}; + defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); + defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); + defaults_.setValue("missed_cleavages", 0, "Missed cleavages for digestion"); + defaults_.setValue("peptide_min_mass", 500, "Minimal peptide mass for database"); + defaults_.setValue("peptide_max_mass", 5000, "Maximal peptide mass for database"); + defaults_.setValue("peptide_min_length", 5, "Minimal peptide length for database"); + defaults_.setValue("peptide_max_length", 50, "Maximal peptide length for database"); + defaults_.setValue("fragment_min_mz", 150, "Minimal fragment mz for database"); + defaults_.setValue("fragment_max_mz", 2000, "Maximal fragment mz for database"); + defaults_.setValue("precursor_mz_tolerance", 2.0, "Tolerance for precursor-m/z in search"); + defaults_.setValue("fragment_mz_tolerance", 0.05, "Tolerance for fragment-m/z in search"); + defaults_.setValue("precursor_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for precursor-m/z"); + defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); + defaults_.setValue("fragment_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for fragment-m/z"); + defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); + defaults_.setValue("modifications_fixed", std::vector{"Carbamidomethyl (C)"}, "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)'"); + defaults_.setValidStrings("modifications_fixed", ListUtils::create(all_mods)); + defaults_.setValue("modifications_variable", std::vector{"Oxidation (M)"}, "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Oxidation (M)'"); + defaults_.setValidStrings("modifications_variable", ListUtils::create(all_mods)); + defaults_.setValue("max_variable_mods_per_peptide", 2, "Maximum number of residues carrying a variable modification per candidate peptide"); + + defaultsToParam_(); + + if (entries.size() == 0) return; + + all_peptides_ = generate_peptides_(entries); + + all_fragments_ = generate_fragments_(); + + bucketsize_ = size_t(sqrt(all_fragments_.size())); //calculating bucketsize for balanced tree (performance) + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + bucket_frags_mz_.emplace_back(all_fragments_[i].fragment_mz_); + } + // sorting the fragments of each bucket by the precursor-m/z + #pragma omp parallel for + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin() + i; + auto condition = distance(all_fragments_.begin(), bucket_begin + bucketsize_) >= int(all_fragments_.size()); + + sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } +} + +void SearchDatabase::updateMembers_() +{ + digestor_enzyme_ = param_.getValue("digestor_enzyme").toString(); + missed_cleavages_ = param_.getValue("missed_cleavages"); + peptide_min_mass_ = param_.getValue("peptide_min_mass"); + peptide_max_mass_ = param_.getValue("peptide_max_mass"); + peptide_min_length_ = param_.getValue("peptide_min_length"); + peptide_max_length_ = param_.getValue("peptide_max_length"); + fragment_min_mz_ = param_.getValue("fragment_min_mz"); + fragment_max_mz_ = param_.getValue("fragment_max_mz"); + precursor_mz_tolerance_ = param_.getValue("precursor_mz_tolerance"); + fragment_mz_tolerance_ = param_.getValue("fragment_mz_tolerance"); + precursor_mz_tolerance_unit_ = param_.getValue("precursor_mz_tolerance_unit").toString(); + fragment_mz_tolerance_unit_ = param_.getValue("fragment_mz_tolerance_unit").toString(); + modifications_fixed_ = ListUtils::toStringList(param_.getValue("modifications_fixed")); + modifications_variable_ = ListUtils::toStringList(param_.getValue("modifications_variable")); + max_variable_mods_per_peptide_ = param_.getValue("max_variable_mods_per_peptide"); +} + +void SearchDatabase::search(MSSpectrum& spectrum, std::vector& candidates) const +{ + candidates.clear(); + if (spectrum.empty()) return; + + unordered_set index_hash; // saving every candidate only once + + std::vector precursor = spectrum.getPrecursors(); + + if (precursor.size() != 1) return; + + double prec_mz = precursor[0].getUnchargedMass(); + + spectrum.sortByIntensity(true); + size_t count_found = 0; + size_t count_rounds = 0; + + auto prec_unit_da = precursor_mz_tolerance_unit_ == UNIT_DA; + auto frag_unit_da = fragment_mz_tolerance_unit_ == UNIT_DA; + + for (const auto& peak : spectrum) + { + if (count_found == 0 && count_rounds == 5) return; // if peak with the 5 highest intensity don't match, skip whole spectrum + + double new_frag_mz_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, peak.getMZ()); + + auto itr_lower = lower_bound(bucket_frags_mz_.begin(), bucket_frags_mz_.end(), peak.getMZ() - new_frag_mz_tolerance); + + size_t index_lower = distance(bucket_frags_mz_.begin(), itr_lower); + + if (index_lower != 0) index_lower--; // lower bound returns one too high (because searching the minimum of each bucket) + + while (bucket_frags_mz_[index_lower] <= (peak.getMZ() + new_frag_mz_tolerance) && index_lower < bucket_frags_mz_.size()) + { + auto bucket_start = all_fragments_.begin() + (index_lower * bucketsize_); + auto bucket_end = all_fragments_.begin() + (index_lower + 1) * bucketsize_; + + auto condition = distance(all_fragments_.begin(), bucket_end) >= int(all_fragments_.size()); + + double new_prec_mz_tolerance = prec_unit_da ? precursor_mz_tolerance_ : ppmToMassAbs(precursor_mz_tolerance_, prec_mz); + + auto itr_lower_inner = lower_bound(bucket_start, condition ? all_fragments_.end() : bucket_end, prec_mz - new_prec_mz_tolerance, + [&](const SearchDatabase::Fragment_& r, double l) { + return (all_peptides_[r.peptide_index_].peptide_mz_ < l); + }); + + while (all_peptides_[(*itr_lower_inner).peptide_index_].peptide_mz_ <= (prec_mz + new_prec_mz_tolerance) && itr_lower_inner != bucket_end && itr_lower_inner != all_fragments_.end()) + { + double theoretical_peak_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, (*itr_lower_inner).fragment_mz_); + // filter by matching fragment-m/z + if (!contains(peak.getMZ(), (*itr_lower_inner).fragment_mz_ - theoretical_peak_tolerance, (*itr_lower_inner).fragment_mz_ + theoretical_peak_tolerance)) + { + ++itr_lower_inner; + continue; + } + index_hash.insert((*itr_lower_inner).peptide_index_); + ++itr_lower_inner; + count_found++; + } + index_lower++; + } + count_rounds++; + } + + for (size_t j : index_hash) + { + candidates.emplace_back(&all_peptides_[j].sequence_, all_peptides_[j].protein_index_); + } +} + +void SearchDatabase::search(MSExperiment& experiment, std::vector& candidates) const +{ + candidates.clear(); + if (experiment.size() == 0) return; + + #pragma omp parallel + { + #pragma omp for + for (size_t i = 0; i < experiment.size(); i++) + { + if (experiment[i].empty()) continue; + + vector temp_cand; + + SearchDatabase::search(experiment[i], temp_cand); + + #pragma omp critical (add_critical) + { + candidates.emplace_back(temp_cand, i); + } + } + } +} + +} // end namespace OpenMS + + + + + diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake old mode 100644 new mode 100755 index 70ecff7564e..6a71fa12258 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -21,6 +21,7 @@ ConsensusMapMergerAlgorithm.cpp FalseDiscoveryRate.cpp FIAMSDataProcessor.cpp FIAMSScheduler.cpp +FragmentIndex.cpp HiddenMarkovModel.cpp IDBoostGraph.cpp IDConflictResolverAlgorithm.cpp diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake old mode 100644 new mode 100755 index 5680cea3537..6c115908b77 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -501,6 +501,7 @@ set(analysis_executables_list FIAMSScheduler_test FLASHDeconvAlgorithm_test FLASHDeconvHelperStructs_test + FragmentIndex_test HiddenMarkovModel_test IDBoostGraph_test IDMapper_test @@ -577,6 +578,7 @@ set(analysis_executables_list ReactionMonitoringTransition_test RNPxlModificationsGenerator_test SVMWrapper_test + SearchDatabase_speed_test SimpleSearchEngineAlgorithm_test SimplePairFinder_test SimpleSVM_test diff --git a/src/tests/class_tests/openms/source/FragmentIndex_test.cpp b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp new file mode 100755 index 00000000000..f13090af831 --- /dev/null +++ b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp @@ -0,0 +1,365 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace OpenMS; +using namespace Constants::UserParam; +using namespace std; + +class FragmentIndex_test : public FragmentIndex +{ + public: + + using FragmentIndex::FragmentIndex; + + std::vector getAllFragments(){return all_fragments_;} + + bool isSortedBucketFragsMZ() + { + return is_sorted(bucket_frags_mz_.begin(), bucket_frags_mz_.end()); + } + + bool isSortedAllFragments() + { + bool test_sorted = true; + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + + test_sorted &= is_sorted(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const Fragment_& l, const Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + return test_sorted; + } + +}; + +START_TEST(FragmentIndex, "$Id:$") + + +START_SECTION(build(const std::vector& entries)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb; + + START_SECTION(test number of fragments) + sdb.build(entries); + TEST_EQUAL(187, sdb.getAllFragments().size()) + END_SECTION + + START_SECTION(test sortation) + TEST_TRUE(sdb.isSortedBucketFragsMZ()) + + TEST_TRUE(sdb.isSortedAllFragments()) + END_SECTION + + START_SECTION(test without modifications) + + auto params = sdb.getParameters(); + + params.setValue("modifications_fixed", std::vector{}); + params.setValue("modifications_variable", std::vector{}); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_NOT_EQUAL(187, sdb.getAllFragments().size()); + + END_SECTION + + START_SECTION(test peptide mz bounds) + + auto params = sdb.getParameters(); + + params.setValue("peptide_min_mass", 2000); + params.setValue("peptide_max_mass", 2000); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_EQUAL(0, sdb.getAllFragments().size()); + + END_SECTION + + START_SECTION(test fragment mz bounds) + + auto params = sdb.getParameters(); + + params.setValue("fragment_min_mz", 500); + params.setValue("fragment_max_mz", 500); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_EQUAL(0, sdb.getAllFragments().size()); + + END_SECTION + +END_SECTION + +START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidates)) + + cout << "\n"; + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb; + std::vector candidates; + + MSSpectrum spec; + + Precursor prec{}; + + START_SECTION(searching without building) + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + sdb.build(entries); + + START_SECTION(Searching 3 Fragments it should find (with Da and ppm)) + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_PPM); + params.setValue("fragment_mz_tolerance", 5.f); + params.setValue("precursor_mz_tolerance_unit", UNIT_PPM); + params.setValue("precursor_mz_tolerance", 50.f); + + sdb.setParameters(params); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Fragment Mass) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_DA); + params.setValue("fragment_mz_tolerance", 0.05); + params.setValue("precursor_mz_tolerance_unit", UNIT_DA); + params.setValue("precursor_mz_tolerance", 2.0); + + sdb.setParameters(params); + + spec.clear(false); + + spec.push_back({1040, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Precursor Mass) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({572.304, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its smaller then all Fragments in Database) + + spec.clear(false); + + spec.push_back({100, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its bigger then all Fragments in Database) + + spec.clear(false); + + spec.push_back({2000, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Testing filtering of Spectrum by best Peaks) + + spec.clear(true); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({2000, 80}); + + spec.push_back({1500, 99}); + + spec.push_back({500, 85}); + + spec.push_back({1000, 90}); + + spec.push_back({100, 100}); + + spec.push_back({937.456, 5}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + +END_SECTION + +START_SECTION(void search(MSExperiment& experiment, std::vector& candidates)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb; + + sdb.build(entries); + + MSExperiment exp; + + MSSpectrum spec; + + Precursor prec{}; + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.318, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(894.529); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({175.119, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(1655.89); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({1544.83, 100}); + + exp.addSpectrum(spec); + + std::vector candidates; + + sdb.search(exp, candidates); + + TEST_EQUAL(candidates.size(), 3) + +END_SECTION + +END_TEST \ No newline at end of file diff --git a/src/tests/class_tests/openms/source/SearchDatabase_test.cpp b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp new file mode 100755 index 00000000000..78769abfeba --- /dev/null +++ b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp @@ -0,0 +1,293 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace OpenMS; +using namespace Constants::UserParam; +using namespace std; + +class SearchDatabase_test : public SearchDatabase +{ + public: + + using SearchDatabase::SearchDatabase; + + std::vector getAllFragments(){return all_fragments_;} + + bool isSortedBucketFragsMZ() + { + return is_sorted(bucket_frags_mz_.begin(), bucket_frags_mz_.end()); + } + + bool isSortedAllFragments() + { + bool test_sorted = true; + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + + test_sorted &= is_sorted(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const Fragment_& l, const Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + return test_sorted; + } + +}; + +START_TEST(SearchDatabase, "$Id:$") + + +START_SECTION(SearchDatabase(const std::vector& entries)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + START_SECTION(test number of fragments) + TEST_EQUAL(187, sdb.getAllFragments().size()) + END_SECTION + + START_SECTION(test sortation) + TEST_TRUE(sdb.isSortedBucketFragsMZ()) + + TEST_TRUE(sdb.isSortedAllFragments()) + END_SECTION + +END_SECTION + +START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidates)) + + cout << "\n"; + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + + MSSpectrum spec; + + Precursor prec{}; + + std::vector candidates; + + START_SECTION(Searching 3 Fragments it should find (with Da and ppm)) + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_PPM); + params.setValue("fragment_mz_tolerance", 5.f); + params.setValue("precursor_mz_tolerance_unit", UNIT_PPM); + params.setValue("precursor_mz_tolerance", 50.f); + + sdb.setParameters(params); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Fragment Mass) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_DA); + params.setValue("fragment_mz_tolerance", 0.05); + params.setValue("precursor_mz_tolerance_unit", UNIT_DA); + params.setValue("precursor_mz_tolerance", 2.0); + + sdb.setParameters(params); + + spec.clear(false); + + spec.push_back({1040, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Precursor Mass) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + + spec.push_back({572.304, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its smaller then all Fragments in Database) + + spec.clear(false); + + spec.push_back({100, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its bigger then all Fragments in Database) + + spec.clear(false); + + spec.push_back({2000, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Testing filtering of Spectrum by best Peaks) + + spec.clear(true); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + + spec.push_back({2000, 80}); + + spec.push_back({1500, 99}); + + spec.push_back({500, 85}); + + spec.push_back({1000, 90}); + + spec.push_back({100, 100}); + + spec.push_back({937.456, 5}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + +END_SECTION + +START_SECTION(void search(MSExperiment& experiment, std::vector& candidates)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + + MSExperiment exp; + + MSSpectrum spec; + + Precursor prec{}; + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + + spec.push_back({605.318, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(894.529); + + spec.setPrecursors({prec}); + + spec.push_back({175.119, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(1655.89); + + spec.setPrecursors({prec}); + + spec.push_back({1544.83, 100}); + + exp.addSpectrum(spec); + + std::vector candidates; + + sdb.search(exp, candidates); + + TEST_EQUAL(candidates.size(), 3) + +END_SECTION + +END_TEST \ No newline at end of file