diff --git a/ms2deepscore/MS2DeepScoreMonteCarlo.py b/ms2deepscore/MS2DeepScoreMonteCarlo.py index 21ad8618..7702248c 100644 --- a/ms2deepscore/MS2DeepScoreMonteCarlo.py +++ b/ms2deepscore/MS2DeepScoreMonteCarlo.py @@ -76,7 +76,6 @@ def __init__(self, model, Set to True to monitor the embedding creating with a progress bar. Default is False. """ - # pylint: disable=too-many-arguments self.model = model if self.model.encoder.dropout.p == 0: raise TypeError("Monte Carlo Dropout is not supposed to be used with a model where dropout-rate=0.") diff --git a/ms2deepscore/benchmarking/plot_ridgeline.py b/ms2deepscore/benchmarking/plot_ridgeline.py index 04c7d9b6..6116f956 100644 --- a/ms2deepscore/benchmarking/plot_ridgeline.py +++ b/ms2deepscore/benchmarking/plot_ridgeline.py @@ -30,7 +30,6 @@ def create_combined_ridgeline_plot(reference_scores, compare_score_name Label string. The default is "MS2DeepScore". """ - # pylint: disable=too-many-arguments histograms, used_bins, _, _ = calculate_histograms(reference_scores, comparison_scores, n_bins, min_resolution, max_resolution) @@ -115,7 +114,6 @@ def score_histogram(scores, n_bins, ax=None, ylabel="scores"): def calculate_histograms(reference_scores, comparison_scores, n_bins=10, min_resolution=20, max_resolution=100): """Calcualte a series of histograms, one for every bin.""" - # pylint: disable=too-many-locals def get_hist_bins(resolution): hist_bins = np.linspace(0, 1, resolution) hist_bins = np.concatenate((hist_bins, np.array([2.0]))) diff --git a/ms2deepscore/models/loss_functions.py b/ms2deepscore/models/loss_functions.py index 9a2ae2b0..62513595 100644 --- a/ms2deepscore/models/loss_functions.py +++ b/ms2deepscore/models/loss_functions.py @@ -109,6 +109,10 @@ def bin_dependent_losses(predictions, """ if predictions.shape != true_values.shape: raise ValueError("Expected true values and predictions to have the same shape") + + # Make sure bins are sorted + ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0]) + 
bin_content = [] losses = {"bin": []} for loss_type in loss_types: diff --git a/ms2deepscore/tensorize_spectra.py b/ms2deepscore/tensorize_spectra.py index 11cecad1..cb44abc4 100644 --- a/ms2deepscore/tensorize_spectra.py +++ b/ms2deepscore/tensorize_spectra.py @@ -33,7 +33,6 @@ def tensorize_spectra( @numba.jit(nopython=True) def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): """Fast function to convert mz and intensity arrays into dense spectrum vector.""" - # pylint: disable=too-many-arguments num_bins = int((max_mz - min_mz) / mz_bin_width) vector = np.zeros((num_bins)) for mz, intensity in zip(mz_array, intensities_array): diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py index 7da11638..d18da37e 100644 --- a/ms2deepscore/train_new_model/inchikey_pair_selection.py +++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py @@ -39,16 +39,24 @@ def select_compound_pairs_wrapper( settings.same_prob_bins, settings.include_diagonal) - aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, - settings, - nr_of_inchikeys=len(inchikeys14_unique)) - - pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(available_pairs_per_bin_matrix, - settings.max_pair_resampling, - aimed_nr_of_pairs_per_bin) - - selected_pairs_per_bin = convert_to_selected_pairs_list(pair_frequency_matrixes, available_pairs_per_bin_matrix, - available_scores_per_bin_matrix, inchikeys14_unique) + aimed_nr_of_pairs_per_bin, bin_priorities = determine_aimed_nr_of_pairs_per_bin( + available_pairs_per_bin_matrix, + settings, + nr_of_inchikeys=len(inchikeys14_unique) + ) + + pair_frequency_matrixes = balanced_selection_of_pairs_per_bin( + available_pairs_per_bin_matrix[bin_priorities, :], + settings.max_pair_resampling, + aimed_nr_of_pairs_per_bin, + ) + + selected_pairs_per_bin = convert_to_selected_pairs_list( + 
pair_frequency_matrixes, + available_pairs_per_bin_matrix[bin_priorities, :], + available_scores_per_bin_matrix[bin_priorities, :], + inchikeys14_unique + ) return [pair for pairs in selected_pairs_per_bin for pair in pairs] @@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin( def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings, nr_of_inchikeys): """Determines the aimed_nr_of_pairs_per_bin. - If the settings given are higher than the highest possible number of pairs it is lowered to that""" + + If the settings given are higher than the highest possible number of pairs it is lowered to that. + """ # Select the nr_of_pairs_per_bin to use nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin(available_pairs_per_bin_matrix) lowest_max_number_of_pairs = min(nr_of_available_pairs_per_bin) * settings.max_pair_resampling print(f"The available nr of pairs per bin are: {nr_of_available_pairs_per_bin}") + + # Set bin priority from lowest to highest no. 
of available pairs + bin_priority = np.argsort(nr_of_available_pairs_per_bin) + print(f"Bin priorities will be ordered accordingly: {[settings.same_prob_bins[i] for i in bin_priority]}") + aimed_nr_of_pairs_per_bin = settings.average_pairs_per_bin * nr_of_inchikeys if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin: print(f"Warning: The average_pairs_per_bin: {settings.average_pairs_per_bin} cannot be reached, " @@ -158,13 +173,13 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings f"Instead the lowest number of available pairs in a bin times the resampling is used, " f"which is: {lowest_max_number_of_pairs}") aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs - return aimed_nr_of_pairs_per_bin + return aimed_nr_of_pairs_per_bin, bin_priority def balanced_selection_of_pairs_per_bin( available_pairs_per_bin_matrix: np.ndarray, max_pair_resampling: int, - nr_of_pairs_per_bin: int + nr_of_pairs_per_bin: int, ) -> np.ndarray: """From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.