From 345cf0d773bc53b9883724e4b5a149edd4cd145c Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 10:23:43 +0200
Subject: [PATCH 1/4] sort bins for evaluation

---
 ms2deepscore/models/loss_functions.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ms2deepscore/models/loss_functions.py b/ms2deepscore/models/loss_functions.py
index 9a2ae2b0..62513595 100644
--- a/ms2deepscore/models/loss_functions.py
+++ b/ms2deepscore/models/loss_functions.py
@@ -109,6 +109,10 @@ def bin_dependent_losses(predictions,
     """
     if predictions.shape != true_values.shape:
         raise ValueError("Expected true values and predictions to have the same shape")
+
+    # Make sure bins are sorted
+    ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
+
     bin_content = []
     losses = {"bin": []}
     for loss_type in loss_types:

From eb79dc6186906d92a2cef6ac9f77847b1d360895 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 11:03:38 +0200
Subject: [PATCH 2/4] sort bins by occupation

---
 .../inchikey_pair_selection.py | 49 +++++++++++++------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py
index 7da11638..e70b03a5 100644
--- a/ms2deepscore/train_new_model/inchikey_pair_selection.py
+++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -39,16 +39,24 @@ def select_compound_pairs_wrapper(
                                                             settings.same_prob_bins,
                                                             settings.include_diagonal)
 
-    aimed_nr_of_pairs_per_bin = determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix,
-                                                                    settings,
-                                                                    nr_of_inchikeys=len(inchikeys14_unique))
-
-    pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(available_pairs_per_bin_matrix,
-                                                                  settings.max_pair_resampling,
-                                                                  aimed_nr_of_pairs_per_bin)
-
-    selected_pairs_per_bin = convert_to_selected_pairs_list(pair_frequency_matrixes, available_pairs_per_bin_matrix,
-                                                            available_scores_per_bin_matrix, inchikeys14_unique)
+    aimed_nr_of_pairs_per_bin, bin_priorities = determine_aimed_nr_of_pairs_per_bin(
+        available_pairs_per_bin_matrix,
+        settings,
+        nr_of_inchikeys=len(inchikeys14_unique)
+    )
+
+    pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
+        available_pairs_per_bin_matrix,
+        settings.max_pair_resampling,
+        aimed_nr_of_pairs_per_bin
+    )
+
+    selected_pairs_per_bin = convert_to_selected_pairs_list(
+        pair_frequency_matrixes,
+        available_pairs_per_bin_matrix,
+        available_scores_per_bin_matrix,
+        inchikeys14_unique
+    )
     return [pair for pairs in selected_pairs_per_bin for pair in pairs]
 
 
@@ -143,12 +151,19 @@ def compute_jaccard_similarity_per_bin(
 
 
 def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings, nr_of_inchikeys):
     """Determines the aimed_nr_of_pairs_per_bin.
-    If the settings given are higher than the highest possible number of pairs it is lowered to that"""
+
+    If the settings given are higher than the highest possible number of pairs it is lowered to that.
+    """
     # Select the nr_of_pairs_per_bin to use
     nr_of_available_pairs_per_bin = get_nr_of_available_pairs_in_bin(available_pairs_per_bin_matrix)
     lowest_max_number_of_pairs = min(nr_of_available_pairs_per_bin) * settings.max_pair_resampling
     print(f"The available nr of pairs per bin are: {nr_of_available_pairs_per_bin}")
+
+    # Set bin priority from lowest to highest no. of available pairs
+    bin_priority = np.argsort(nr_of_available_pairs_per_bin)
+    print(f"Bin priorities will be ordered accordingly: {[settings.same_prob_bins[i] for i in bin_priority]}")
+
     aimed_nr_of_pairs_per_bin = settings.average_pairs_per_bin * nr_of_inchikeys
     if lowest_max_number_of_pairs < aimed_nr_of_pairs_per_bin:
         print(f"Warning: The average_pairs_per_bin: {settings.average_pairs_per_bin} cannot be reached, "
@@ -158,13 +173,14 @@ def determine_aimed_nr_of_pairs_per_bin(available_pairs_per_bin_matrix, settings
               f"Instead the lowest number of available pairs in a bin times the resampling is used, "
               f"which is: {lowest_max_number_of_pairs}")
         aimed_nr_of_pairs_per_bin = lowest_max_number_of_pairs
-    return aimed_nr_of_pairs_per_bin
+    return aimed_nr_of_pairs_per_bin, bin_priority
 
 
 def balanced_selection_of_pairs_per_bin(
     available_pairs_per_bin_matrix: np.ndarray,
     max_pair_resampling: int,
-    nr_of_pairs_per_bin: int
+    nr_of_pairs_per_bin: int,
+    bin_priority: np.ndarray = None,
 ) -> np.ndarray:
     """From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.
 
@@ -190,11 +206,16 @@ def balanced_selection_of_pairs_per_bin(
         Resampling means that the exact same inchikey pair is added multiple times to the list of pairs.
     nr_of_pairs_per_bin:
         The number of pairs that should be sampled for each tanimoto bin.
+    bin_priority:
+        Bins will be processed in the order given in bin_priority. Default is set to None in which case no change
+        to the order will be done.
     """
+    if bin_priority is None:
+        bin_priority = np.arange(0, available_pairs_per_bin_matrix.shape[0])
     inchikey_count = np.zeros(available_pairs_per_bin_matrix.shape[1])
     pair_frequency_matrixes = []
 
-    for pairs_in_bin in available_pairs_per_bin_matrix:
+    for pairs_in_bin in available_pairs_per_bin_matrix[bin_priority]:
         pair_frequencies, inchikey_count = select_balanced_pairs(pairs_in_bin,
                                                                  inchikey_count,
                                                                  nr_of_pairs_per_bin,

From bd5837dbec219b4e5f9470797637418db4965e07 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Tue, 15 Oct 2024 11:40:35 +0200
Subject: [PATCH 3/4] add occupation based bin ordering

---
 .../train_new_model/inchikey_pair_selection.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py
index e70b03a5..d18da37e 100644
--- a/ms2deepscore/train_new_model/inchikey_pair_selection.py
+++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py
@@ -46,15 +46,15 @@ def select_compound_pairs_wrapper(
     )
 
     pair_frequency_matrixes = balanced_selection_of_pairs_per_bin(
-        available_pairs_per_bin_matrix,
+        available_pairs_per_bin_matrix[bin_priorities, :],
         settings.max_pair_resampling,
-        aimed_nr_of_pairs_per_bin
+        aimed_nr_of_pairs_per_bin,
     )
 
     selected_pairs_per_bin = convert_to_selected_pairs_list(
         pair_frequency_matrixes,
-        available_pairs_per_bin_matrix,
-        available_scores_per_bin_matrix,
+        available_pairs_per_bin_matrix[bin_priorities, :],
+        available_scores_per_bin_matrix[bin_priorities, :],
         inchikeys14_unique
     )
     return [pair for pairs in selected_pairs_per_bin for pair in pairs]
@@ -180,7 +180,6 @@ def balanced_selection_of_pairs_per_bin(
     available_pairs_per_bin_matrix: np.ndarray,
     max_pair_resampling: int,
     nr_of_pairs_per_bin: int,
-    bin_priority: np.ndarray = None,
 ) -> np.ndarray:
     """From the available_pairs_per_bin_matrix a balanced selection is made to have a balanced distribution.
 
@@ -206,16 +205,11 @@ def balanced_selection_of_pairs_per_bin( Resampling means that the exact same inchikey pair is added multiple times to the list of pairs. nr_of_pairs_per_bin: The number of pairs that should be sampled for each tanimoto bin. - bin_priority: - Bins will be processed in the order given in bin_priority. Default is set to None in which case no change - to the order will be done. """ - if bin_priority is None: - bin_priority = np.arange(0, available_pairs_per_bin_matrix.shape[0]) inchikey_count = np.zeros(available_pairs_per_bin_matrix.shape[1]) pair_frequency_matrixes = [] - for pairs_in_bin in available_pairs_per_bin_matrix[bin_priority]: + for pairs_in_bin in available_pairs_per_bin_matrix: pair_frequencies, inchikey_count = select_balanced_pairs(pairs_in_bin, inchikey_count, nr_of_pairs_per_bin, From c66ae9a3eb0fb2853cace2c6c9c32b4194e388f5 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Tue, 15 Oct 2024 11:40:56 +0200 Subject: [PATCH 4/4] remove pylint remains --- ms2deepscore/MS2DeepScoreMonteCarlo.py | 1 - ms2deepscore/benchmarking/plot_ridgeline.py | 2 -- ms2deepscore/tensorize_spectra.py | 1 - 3 files changed, 4 deletions(-) diff --git a/ms2deepscore/MS2DeepScoreMonteCarlo.py b/ms2deepscore/MS2DeepScoreMonteCarlo.py index 21ad8618..7702248c 100644 --- a/ms2deepscore/MS2DeepScoreMonteCarlo.py +++ b/ms2deepscore/MS2DeepScoreMonteCarlo.py @@ -76,7 +76,6 @@ def __init__(self, model, Set to True to monitor the embedding creating with a progress bar. Default is False. """ - # pylint: disable=too-many-arguments self.model = model if self.model.encoder.dropout.p == 0: raise TypeError("Monte Carlo Dropout is not supposed to be used with a model where dropout-rate=0.") diff --git a/ms2deepscore/benchmarking/plot_ridgeline.py b/ms2deepscore/benchmarking/plot_ridgeline.py index 04c7d9b6..6116f956 100644 --- a/ms2deepscore/benchmarking/plot_ridgeline.py +++ b/ms2deepscore/benchmarking/plot_ridgeline.py @@ -30,7 +30,6 @@ def create_combined_ridgeline_plot(reference_scores, compare_score_name Label string. The default is "MS2DeepScore". """ - # pylint: disable=too-many-arguments histograms, used_bins, _, _ = calculate_histograms(reference_scores, comparison_scores, n_bins, min_resolution, max_resolution) @@ -115,7 +114,6 @@ def score_histogram(scores, n_bins, ax=None, ylabel="scores"): def calculate_histograms(reference_scores, comparison_scores, n_bins=10, min_resolution=20, max_resolution=100): """Calcualte a series of histograms, one for every bin.""" - # pylint: disable=too-many-locals def get_hist_bins(resolution): hist_bins = np.linspace(0, 1, resolution) hist_bins = np.concatenate((hist_bins, np.array([2.0]))) diff --git a/ms2deepscore/tensorize_spectra.py b/ms2deepscore/tensorize_spectra.py index 11cecad1..cb44abc4 100644 --- a/ms2deepscore/tensorize_spectra.py +++ b/ms2deepscore/tensorize_spectra.py @@ -33,7 +33,6 @@ def tensorize_spectra( @numba.jit(nopython=True) def vectorize_spectrum(mz_array, intensities_array, min_mz, max_mz, mz_bin_width, intensity_scaling): """Fast function to convert mz and intensity arrays into dense spectrum vector.""" - # pylint: disable=too-many-arguments num_bins = int((max_mz - min_mz) / mz_bin_width) vector = np.zeros((num_bins)) for mz, intensity in zip(mz_array, intensities_array):
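
What the series does, in brief: patch 1 sorts the reference score bins by their lower bound before computing bin-dependent losses, and patches 2 and 3 rank the Tanimoto bins by how many inchikey pairs they have available (their "occupation") so that the most sparsely filled bins are processed first during balanced pair selection. The snippet below is a minimal standalone sketch of that ordering logic, not code from the package: the toy matrix, the per-bin counting with np.count_nonzero, and the example bin edges are assumptions made purely for illustration (in the package the counts come from get_nr_of_available_pairs_in_bin and the bins from settings.same_prob_bins).

import numpy as np

# Toy stand-in for available_pairs_per_bin_matrix: 4 score bins (rows) x 6 inchikeys
# (columns); a nonzero entry marks an available pair. Values are purely illustrative.
available_pairs_per_bin_matrix = np.array([
    [1, 0, 0, 1, 0, 0],   # sparsely occupied bin
    [1, 1, 1, 1, 1, 1],   # densely occupied bin
    [1, 1, 0, 1, 1, 0],
    [0, 1, 1, 0, 0, 1],
])

# Count how many pairs each bin offers (illustrative stand-in for
# get_nr_of_available_pairs_in_bin).
nr_of_available_pairs_per_bin = np.count_nonzero(available_pairs_per_bin_matrix, axis=1)

# Patches 2/3: rank bins from least to most occupied and reorder the rows accordingly,
# so the balanced selection visits the scarcest bins first.
bin_priorities = np.argsort(nr_of_available_pairs_per_bin)
print(f"Bins will be processed in this order: {bin_priorities}")  # -> [0 3 2 1]
reordered_matrix = available_pairs_per_bin_matrix[bin_priorities, :]

# Patch 1: make sure reference score bins are sorted by their lower edge before the
# bin-dependent losses are evaluated. These bin edges are made up for the example.
ref_score_bins = [(0.5, 0.75), (0.0, 0.25), (0.75, 1.0), (0.25, 0.5)]
ref_score_bins = sorted(ref_score_bins, key=lambda x: x[0])
print(ref_score_bins)  # -> [(0.0, 0.25), (0.25, 0.5), (0.5, 0.75), (0.75, 1.0)]

Processing the scarcest bins first means their few candidate pairs are claimed before the per-inchikey counts fill up, which is the balancing effect the selection aims for.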