From 345cf0d773bc53b9883724e4b5a149edd4cd145c Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 10:23:43 +0200
Subject: [PATCH 1/4] sort bins for evaluation

---
 ms2deepscore/models/loss_functions.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ms2deepscore/models/loss_functions.py b/ms2deepscore/models/loss_functions.py
index 9a2ae2b0..62513595 100644
--- a/ms2deepscore/models/loss_functions.py
+++ b/ms2deepscore/models/loss_functions.py
@@ -109,6 +109,10 @@ def bin_dependent_losses(predictions,
     """
     if predictions.shape != true_values.shape:
         raise ValueError("Expected true values and predictions to have the same shape")
+
+    # Make sure bins are sorted
+    ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
+
     bin_content = []
     losses = {"bin": []}
     for loss_type in loss_types:

From eb79dc6186906d92a2cef6ac9f77847b1d360895 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 11:03:38 +0200
Subject: [PATCH 2/4] sort bins by occupation

---
 .../inchikey_pair_selection.py | 49 +++++++++++++------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py
index 7da11638..e70b03a5 100644
--- a/ms2deepscore/train_new_model/inchikey_pair_selection.py
+++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -39,16 +39,24 @@ def select_compound_pairs_wrapper(
                                                             settings.same_prob_bins,
                                                             settings.include_diagonal)
 
-    aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix,
-                                                                    settings,
-                                                                    nr_of_inchikeys=len(inchikeys14_unique))
-
-    pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(available_pairs_per_bin_matrix,
-                                                                  settings.max_pair_resampling,
-                                                                  aimed_nr_of_pairs_per_bin)
-
-    selected_pairs_per_bin = convert_to_selected_pairs_list(pair_frequency_matrixes, available_pairs_per_bin_matrix,
-                                                            available_scores_per_bin_matrix, inchikeys14_unique)
+    aimed_nr_of_pairs_per_bin, bin_priorities = determine_aimed_nr_of_pairs_per_bin(
+        available_pairs_per_bin_matrix,
+        settings,
+        nr_of_inchikeys=len(inchikeys14_unique)
+    )
+
+    pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
+        available_pairs_per_bin_matrix,
+        settings.max_pair_resampling,
+        aimed_nr_of_pairs_per_bin
+    )
+
+    selected_pairs_per_bin = convert_to_selected_pairs_list(
+        pair_frequency_matrixes,
+        available_pairs_per_bin_matrix,
+        available_scores_per_bin_matrix,
+        inchikeys14_unique
+    )
     return [pair for pairs in selected_pairs_per_bin for pair in pairs]
 
 
@@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin(
 
 
 def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings, nr_of_inchikeys):
     """Determines the aimed_nr_of_pairs_per_bin.
-    If the settings given are higher than the highest possible number of pairs it is lowered to that"""
+
+    If the settings given are higher than the highest possible number of pairs it is lowered to that.
+    """
     # Select the nr_of_pairs_per_bin to use
     nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin(available_pairs_per_bin_matrix)
     lowest_max_number_of_pairs = min(nr_of_available_pairs_per_bin) * settings.max_pair_resampling
     print(f"The available nr of pairs per bin are: {nr_of_available_pairs_per_bin}")
+
+    # Set bin priority from lowest to highest no. of available pairs
+    bin_priority = np.argsort(nr_of_available_pairs_per_bin)
+    print(f"Bin priorities will be ordered accordingly: {[settings.same_prob_bins[i] for i in bin_priority]}")
+
     aimed_nr_of_pairs_per_bin = settings.average_pairs_per_bin * nr_of_inchikeys
     if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin:
         print(f"Warning: The average_pairs_per_bin: {settings.average_pairs_per_bin} cannot be reached, "
@@ -158,13 +173,14 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings
               f"Instead the lowest number of available pairs in a bin times the resampling is used, "
               f"which is: {lowest_max_number_of_pairs}")
         aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs
-    return aimed_nr_of_pairs_per_bin
+    return aimed_nr_of_pairs_per_bin, bin_priority
 
 
 def balanced_selection_of_pairs_per_bin(
     available_pairs_per_bin_matrix: np.ndarray,
     max_pair_resampling: int,
-    nr_of_pairs_per_bin: int
+    nr_of_pairs_per_bin: int,
+    bin_priority: np.ndarray = None,
 ) -> np.ndarray:
     """From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.
 
@@ -190,11 +206,16 @@ def balanced_selection_of_pairs_per_bin(
         Resampling means that the exact same inchikey pair is added multiple times to the list of pairs.
     nr_of_pairs_per_bin:
         The number of pairs that should be sampled for each tanimoto bin.
+    bin_priority:
+        Bins will be processed in the order given in bin_priority. Default is set to None in which case no change
+        to the order will be done.
     """
+    if bin_priority is None:
+        bin_priority = np.arange(0, available_pairs_per_bin_matrix.shape[0])
     inchikey_count = np.zeros(available_pairs_per_bin_matrix.shape[1])
     pair_frequency_matrixes = []
 
-    for pairs_in_bin in available_pairs_per_bin_matrix:
+    for pairs_in_bin in available_pairs_per_bin_matrix[bin_priority]:
         pair_frequencies, inchikey_count = select_balanced_pairs(pairs_in_bin,
                                                                  inchikey_count,
                                                                  nr_of_pairs_per_bin,

From bd5837dbec219b4e5f9470797637418db4965e07 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 11:40:35 +0200
Subject: [PATCH 3/4] add occupation based bin ordering

---
 .../train_new_model/inchikey_pair_selection.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py
index e70b03a5..d18da37e 100644
--- a/ms2deepscore/train_new_model/inchikey_pair_selection.py
+++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -46,15 +46,15 @@ def select_compound_pairs_wrapper(
     )
 
     pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
-        available_pairs_per_bin_matrix,
+        available_pairs_per_bin_matrix[bin_priorities, :],
         settings.max_pair_resampling,
-        aimed_nr_of_pairs_per_bin
+        aimed_nr_of_pairs_per_bin,
     )
 
     selected_pairs_per_bin = convert_to_selected_pairs_list(
         pair_frequency_matrixes,
-        available_pairs_per_bin_matrix,
-        available_scores_per_bin_matrix,
+        available_pairs_per_bin_matrix[bin_priorities, :],
+        available_scores_per_bin_matrix[bin_priorities, :],
         inchikeys14_unique
     )
     return [pair for pairs in selected_pairs_per_bin for pair in pairs]
@@ -180,7 +180,6 @@ def balanced_selection_of_pairs_per_bin(
     available_pairs_per_bin_matrix: np.ndarray,
     max_pair_resampling: int,
     nr_of_pairs_per_bin: int,
-    bin_priority: np.ndarray = None,
 ) -> np.ndarray:
     """From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.
 
@@ -206,16 +205,11 @@ def balanced_selection_of_pairs_per_bin( Resampling means that the exact same inchikey pair is added multiple times to the list of pairs. nr_of_pairs_per_bin: The number of pairs that should be sampled for each tanimoto bin. - bin_priority: - Bins will be processed in the order given in bin_priority. Default is set to None in which case no change - to the order will be done. """ - if bin_priority is None: - bin_priority = np.arange(0, available_pairs_per_bin_matrix.shape[0]) inchikey_count = np.zeros(available_pairs_per_bin_matrix.shape[1]) pair_frequency_matrixes = [] - for pairs_in_bin in available_pairs_per_bin_matrix[bin_priority]: + for pairs_in_bin in available_pairs_per_bin_matrix: pair_frequencies, inchikey_count = select_balanced_pairs(pairs_in_bin, inchikey_count, nr_of_pairs_per_bin, From c66ae9a3eb0fb2853cace2c6c9c32b4194e388f5 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Tue, 15 Oct 2024 11:40:56 +0200 Subject: [PATCH 4/4] remove pylint remains --- ms2deepscore/MS2DeepScoreMonteCarlo.py | 1 - ms2deepscore/benchmarking/plot_ridgeline.py | 2 -- ms2deepscore/tensorize_spectra.py | 1 - 3 files changed, 4 deletions(-) diff --git a/ms2deepscore/MS2DeepScoreMonteCarlo.py b/ms2deepscore/MS2DeepScoreMonteCarlo.py index 21ad8618..7702248c 100644 --- a/ms2deepscore/MS2DeepScoreMonteCarlo.py +++ b/ms2deepscore/MS2DeepScoreMonteCarlo.py @@ -76,7 +76,6 @@ def __init__(self, model, Set to True to monitor the embedding creating with a progress bar. Default is False. """ - # pylint: disable=too-many-arguments self.model = model if self.model.encoder.dropout.p == 0: raise TypeError("Monte Carlo Dropout is not supposed to be used with a model where dropout-rate=0.") diff --git a/ms2deepscore/benchmarking/plot_ridgeline.py b/ms2deepscore/benchmarking/plot_ridgeline.py index 04c7d9b6..6116f956 100644 --- a/ms2deepscore/benchmarking/plot_ridgeline.py +++ b/ms2deepscore/benchmarking/plot_ridgeline.py @@ -30,7 +30,6 @@ def create_combined_ridgeline_plot(reference_scores, compare_score_name Label string. The default is "MS2DeepScore". """ - # pylint: disable=too-many-arguments histograms, used_bins, _, _ = calculate_histograms(reference_scores, comparison_scores, n_bins, min_resolution, max_resolution) @@ -115,7 +114,6 @@ def score_histogram(scores, n_bins, ax=None, ylabel="scores"): def calculate_histograms(reference_scores, comparison_scores, n_bins=10, min_resolution=20, max_resolution=100): """Calcualte a series of histograms, one for every bin.""" - # pylint: disable=too-many-locals def get_hist_bins(resolution): hist_bins = np.linspace(0, 1, resolution) hist_bins = np.concatenate((hist_bins, np.array([2.0]))) diff --git a/ms2deepscore/tensorize_spectra.py b/ms2deepscore/tensorize_spectra.py index 11cecad1..cb44abc4 100644 --- a/ms2deepscore/tensorize_spectra.py +++ b/ms2deepscore/tensorize_spectra.py @@ -33,7 +33,6 @@ def tensorize_spectra( @numba.jit(nopython=True) def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): """Fast function to convert mz and intensity arrays into dense spectrum vector.""" - # pylint: disable=too-many-arguments num_bins = int((max_mz - min_mz) / mz_bin_width) vector = np.zeros((num_bins)) for mz, intensity in zip(mz_array, intensities_array):
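
What the series does, in brief: patch 1 sorts the reference score bins by their lower bound before computing bin-dependent losses, and patches 2 and 3 rank the Tanimoto bins by how many inchikey pairs they have available (their "occupation") so that the most sparsely filled bins are processed first during balanced pair selection. The snippet below is a minimal standalone sketch of that ordering logic, not code from the package: the toy matrix, the per-bin counting with np.count_nonzero, and the example bin edges are assumptions made purely for illustration (in the package the counts come from get_nr_of_available_pairs_in_bin and the bins from settings.same_prob_bins).

import numpy as np

# Toy stand-in for available_pairs_per_bin_matrix: 4 score bins (rows) x 6 inchikeys
# (columns); a nonzero entry marks an available pair. Values are purely illustrative.
available_pairs_per_bin_matrix = np.array([
    [1, 0, 0, 1, 0, 0],   # sparsely occupied bin
    [1, 1, 1, 1, 1, 1],   # densely occupied bin
    [1, 1, 0, 1, 1, 0],
    [0, 1, 1, 0, 0, 1],
])

# Count how many pairs each bin offers (illustrative stand-in for
# get_nr_of_available_pairs_in_bin).
nr_of_available_pairs_per_bin = np.count_nonzero(available_pairs_per_bin_matrix, axis=1)

# Patches 2/3: rank bins from least to most occupied and reorder the rows accordingly,
# so the balanced selection visits the scarcest bins first.
bin_priorities = np.argsort(nr_of_available_pairs_per_bin)
print(f"Bins will be processed in this order: {bin_priorities}")  # -> [0 3 2 1]
reordered_matrix = available_pairs_per_bin_matrix[bin_priorities, :]

# Patch 1: make sure reference score bins are sorted by their lower edge before the
# bin-dependent losses are evaluated. These bin edges are made up for the example.
ref_score_bins = [(0.5, 0.75), (0.0, 0.25), (0.75, 1.0), (0.25, 0.5)]
ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
print(ref_score_bins)  # -> [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]

Processing the scarcest bins first means their few candidate pairs are claimed before the per-inchikey counts fill up, which is the balancing effect the selection aims for.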