diff --git a/pyproject.toml b/pyproject.toml
index 8c3a3cf3b1..1fb6699ec3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -154,6 +154,7 @@ test = [
     "pytest-dependency",
     "pytest-cov",
     "psutil",
+    "scikit-learn",

     # preprocessing
     "ibllib>=3.4.1;python_version>='3.10'",
@@ -167,7 +168,6 @@ test = [
     # tridesclous2
     "numba<0.61.0;python_version<'3.13'",
     "numba>=0.61.0;python_version>='3.13'",
-    "hdbscan>=0.8.33",  # Previous version had a broken wheel
     # isosplit is needed for trideclous2 noramaly but isosplit is only build until python3.11
     # so lets wait a new build of isosplit6

@@ -205,7 +205,6 @@ docs = [
     # for notebooks in the gallery
     "MEArec",  # Use as an example
     "pandas",  # in the modules gallery comparison tutorial
-    "hdbscan>=0.8.33",  # For sorters spykingcircus2 + tridesclous
     "numba",  # For many postprocessing functions
     "networkx",
     "skops",  # For automated curation
diff --git a/src/spikeinterface/sorters/internal/simplesorter.py b/src/spikeinterface/sorters/internal/simplesorter.py
index f24e965c2e..ac5e4fa5c2 100644
--- a/src/spikeinterface/sorters/internal/simplesorter.py
+++ b/src/spikeinterface/sorters/internal/simplesorter.py
@@ -39,7 +39,7 @@ class SimpleSorter(ComponentsBasedSorter):
             "method": "hdbscan",
             "min_cluster_size": 25,
             "allow_single_cluster": True,
-            "core_dist_n_jobs": -1,
+            "n_jobs": -1,
             "cluster_selection_method": "leaf",
         },
         # "cache_preprocessing": {"mode": None, "memory_limit": 0.5, "delete_cache": True},
@@ -58,7 +58,7 @@ class SimpleSorter(ComponentsBasedSorter):
         "clustering": (
             "A dictionary for specifying the clustering parameters: 'method' (to cluster) default: 'hdbscan', "
             "'min_cluster_size' (min number of spikes per cluster) default: 25, 'allow_single_cluster' default: True, "
-            " 'core_dist_n_jobs' (parallelization) default: -1, cluster_selection_method (for hdbscan) default: leaf"
+            " 'n_jobs' (parallelization) default: -1, cluster_selection_method (for hdbscan) default: leaf"
         ),
         "job_kwargs": "Spikeinterface job_kwargs (see job_kwargs documentation) default 'n_jobs': -1, 'chunk_duration': '1s'",
     }
@@ -183,10 +183,10 @@ def _run_from_folder(cls, sorter_output_folder, params, verbose):
         clust_method = clust_params.pop("method", "hdbscan")

         if clust_method == "hdbscan":
-            import hdbscan
+            from sklearn.cluster import HDBSCAN

-            out = hdbscan.hdbscan(features_flat, **clust_params)
-            peak_labels = out[0]
+            model = HDBSCAN(**clust_params).fit(features_flat)
+            peak_labels = model.labels_.copy()
         elif clust_method == "hdbscan-gpu":
             from cuml.cluster import HDBSCAN as hdbscan

diff --git a/src/spikeinterface/sortingcomponents/clustering/cleaning_tools.py b/src/spikeinterface/sortingcomponents/clustering/cleaning_tools.py
index 8028761ccb..14d0444343 100644
--- a/src/spikeinterface/sortingcomponents/clustering/cleaning_tools.py
+++ b/src/spikeinterface/sortingcomponents/clustering/cleaning_tools.py
@@ -15,7 +15,7 @@ def _split_waveforms(
     wfs_and_noise, noise_size, n_components_by_channel, n_components, hdbscan_params, probability_thr, debug
 ):
     import sklearn.decomposition
-    import hdbscan
+    from sklearn.cluster import HDBSCAN

     valid_size = wfs_and_noise.shape[0] - noise_size

@@ -30,9 +30,9 @@ def _split_waveforms(
     local_feature = pca.fit_transform(local_feature)

     # hdbscan on pca
-    clustering = hdbscan.hdbscan(local_feature, **hdbscan_params)
-    local_labels_with_noise = clustering[0]
-    cluster_probability = clustering[2]
+    clustering = HDBSCAN(**hdbscan_params).fit(local_feature)
+    local_labels_with_noise = clustering.labels_
+    cluster_probability = clustering.probabilities_
     (persistent_clusters,) = np.nonzero(cluster_probability > probability_thr)
     local_labels_with_noise[~np.isin(local_labels_with_noise, persistent_clusters)] = -1

@@ -95,7 +95,7 @@ def _split_waveforms_nested(
     wfs_and_noise, noise_size, nbefore, n_components_by_channel, n_components, hdbscan_params, probability_thr, debug
 ):
     import sklearn.decomposition
-    import hdbscan
+    from sklearn.cluster import HDBSCAN

     valid_size = wfs_and_noise.shape[0] - noise_size

@@ -123,10 +123,10 @@ def _split_waveforms_nested(
         # ~ local_feature = pca.fit_transform(local_feature)

         # hdbscan on pca
-        clustering = hdbscan.hdbscan(local_feature, **hdbscan_params)
-        active_labels_with_noise = clustering[0]
-        cluster_probability = clustering[2]
-        (persistent_clusters,) = np.nonzero(clustering[2] > probability_thr)
+        clustering = HDBSCAN(**hdbscan_params).fit(local_feature)
+        active_labels_with_noise = clustering.labels_
+        cluster_probability = clustering.probabilities_
+        (persistent_clusters,) = np.nonzero(cluster_probability > probability_thr)
         active_labels_with_noise[~np.isin(active_labels_with_noise, persistent_clusters)] = -1

         active_labels = active_labels_with_noise[active_ind < valid_size]
@@ -233,7 +233,7 @@ def auto_split_clustering(
     """

     import sklearn.decomposition
-    import hdbscan
+    from sklearn.cluster import HDBSCAN

     split_peak_labels = -1 * np.ones(peak_labels.size, dtype=np.int64)
     nb_clusters = 0
diff --git a/src/spikeinterface/sortingcomponents/clustering/graph_clustering.py b/src/spikeinterface/sortingcomponents/clustering/graph_clustering.py
index f62a81bade..ef2a42259c 100644
--- a/src/spikeinterface/sortingcomponents/clustering/graph_clustering.py
+++ b/src/spikeinterface/sortingcomponents/clustering/graph_clustering.py
@@ -40,7 +40,7 @@ class GraphClustering:
         "clusterer": dict(
             method="sknetwork-louvain",
             # min_samples=1,
-            # core_dist_n_jobs=-1,
+            # n_jobs=-1,
             # min_cluster_size=20,
             # cluster_selection_method='leaf',
             # allow_single_cluster=True,
@@ -165,7 +165,7 @@ def main_function(cls, recording, peaks, params, job_kwargs=dict()):
             _remove_small_cluster(peak_labels, min_size=1)

         elif clustering_method == "hdbscan":
-            from hdbscan import HDBSCAN
+            from sklearn.cluster import HDBSCAN
             import scipy.sparse

             n_graph, connected_labels = scipy.sparse.csgraph.connected_components(distances, directed=False)
diff --git a/src/spikeinterface/sortingcomponents/clustering/itersplit_tools.py b/src/spikeinterface/sortingcomponents/clustering/itersplit_tools.py
index 0411166de6..6a8783fb76 100644
--- a/src/spikeinterface/sortingcomponents/clustering/itersplit_tools.py
+++ b/src/spikeinterface/sortingcomponents/clustering/itersplit_tools.py
@@ -291,9 +291,9 @@ def split(
     tsvd = None

     if clusterer_method == "hdbscan":
-        from hdbscan import HDBSCAN
+        from sklearn.cluster import HDBSCAN

-        clustering_kwargs.update(core_dist_n_jobs=1)
+        clustering_kwargs.update(n_jobs=1)
         clust = HDBSCAN(**clustering_kwargs)
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore")
diff --git a/src/spikeinterface/sortingcomponents/clustering/positions.py b/src/spikeinterface/sortingcomponents/clustering/positions.py
index 6fe2400c56..232b11edc2 100644
--- a/src/spikeinterface/sortingcomponents/clustering/positions.py
+++ b/src/spikeinterface/sortingcomponents/clustering/positions.py
@@ -4,13 +4,14 @@
 from pathlib import Path

 import numpy as np
+import importlib.util

-try:
-    import hdbscan
-
-    HAVE_HDBSCAN = True
-except:
-    HAVE_HDBSCAN = False
+sklearn_spec = importlib.util.find_spec("sklearn")
+if sklearn_spec is not None:
+    HAVE_SKLEARN = True
+    from sklearn.cluster import HDBSCAN
+else:
+    HAVE_SKLEARN = False


 class PositionsClustering:
@@ -21,7 +22,7 @@ class PositionsClustering:
     _default_params = {
         "peak_locations": None,
         "peak_localization_kwargs": {"method": "center_of_mass"},
-        "hdbscan_kwargs": {"min_cluster_size": 20, "allow_single_cluster": True, "core_dist_n_jobs": -1},
+        "hdbscan_kwargs": {"min_cluster_size": 20, "allow_single_cluster": True, "n_jobs": -1},
     }

     name = "hdbscan_positions"
@@ -36,8 +37,7 @@ class PositionsClustering:

     @classmethod
     def main_function(cls, recording, peaks, params, job_kwargs=dict()):
-        assert HAVE_HDBSCAN, "position clustering need hdbscan to be installed"
-
+        assert HAVE_SKLEARN, "position clustering needs scikit-learn to be installed"
         if params["peak_locations"] is None:
             from spikeinterface.sortingcomponents.peak_localization import localize_peaks

@@ -50,8 +50,8 @@ def main_function(cls, recording, peaks, params, job_kwargs=dict()):
         location_keys = ["x", "y"]
         locations = np.stack([peak_locations[k] for k in location_keys], axis=1)

-        clustering = hdbscan.hdbscan(locations, **params["hdbscan_kwargs"])
-        peak_labels = clustering[0]
+        clustering = HDBSCAN(**params["hdbscan_kwargs"]).fit(locations)
+        peak_labels = clustering.labels_

         labels = np.unique(peak_labels)
         labels = labels[labels >= 0]
diff --git a/src/spikeinterface/sortingcomponents/clustering/random_projections.py b/src/spikeinterface/sortingcomponents/clustering/random_projections.py
index ac059e22ea..7f3b37bc67 100644
--- a/src/spikeinterface/sortingcomponents/clustering/random_projections.py
+++ b/src/spikeinterface/sortingcomponents/clustering/random_projections.py
@@ -6,12 +6,15 @@
 import importlib
 import numpy as np

-hdbscan_spec = importlib.util.find_spec("hdbscan")
-if hdbscan_spec is not None:
-    HAVE_HDBSCAN = True
-    import hdbscan
+
+import importlib.util
+
+sklearn_spec = importlib.util.find_spec("sklearn")
+if sklearn_spec is not None:
+    HAVE_SKLEARN = True
+    from sklearn.cluster import HDBSCAN
 else:
-    HAVE_HDBSCAN = False
+    HAVE_SKLEARN = False

 from spikeinterface.core.basesorting import minimum_spike_dtype
 from spikeinterface.core.waveform_tools import estimate_templates
@@ -35,7 +38,7 @@ class RandomProjectionClustering:
         "clusterer": {
             "min_cluster_size": 10,
             "allow_single_cluster": True,
-            "core_dist_n_jobs": -1,
+            "n_jobs": -1,
             "cluster_selection_method": "eom",
         },
         "waveforms": {"ms_before": 0.5, "ms_after": 1.5},
@@ -56,8 +59,7 @@ class RandomProjectionClustering:

     @classmethod
     def main_function(cls, recording, peaks, params, job_kwargs=dict()):
-        assert HAVE_HDBSCAN, "random projections clustering need hdbscan to be installed"
-
+        assert HAVE_SKLEARN, "random projections clustering needs scikit-learn to be installed"
         fs = recording.get_sampling_frequency()
         radius_um = params.get("radius_um", 30)
         ms_before = params["waveforms"].get("ms_before", 0.5)
@@ -105,8 +107,8 @@ def main_function(cls, recording, peaks, params, job_kwargs=dict()):
            recording, pipeline_nodes, job_kwargs=job_kwargs, job_name="extracting features", verbose=verbose
        )

-        clustering = hdbscan.hdbscan(hdbscan_data, **params["clusterer"])
-        peak_labels = clustering[0]
+        clustering = HDBSCAN(**params["clusterer"]).fit(hdbscan_data)
+        peak_labels = clustering.labels_

        labels = np.unique(peak_labels)
        labels = labels[labels >= 0]
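
For reviewers, the following standalone sketch (not part of the patch) illustrates the API migration the diff applies throughout: the old top-level hdbscan.hdbscan(...) call returned a tuple read positionally, whereas sklearn.cluster.HDBSCAN (available since scikit-learn 1.3) is an estimator whose results are read from fitted attributes, and its parallelism argument is n_jobs instead of core_dist_n_jobs. The data and parameter values below are illustrative only.

# Minimal sketch of the hdbscan -> scikit-learn migration (illustrative, not part of the patch).
# Requires scikit-learn >= 1.3.
import numpy as np
from sklearn.cluster import HDBSCAN

rng = np.random.default_rng(42)
features = rng.normal(size=(500, 2))  # stand-in for the flattened peak features

# Old standalone-hdbscan style (removed by this diff): labels came back positionally,
# e.g. out = hdbscan.hdbscan(features, min_cluster_size=25, core_dist_n_jobs=-1); labels = out[0]

# New scikit-learn estimator style: core_dist_n_jobs becomes n_jobs,
# and results are exposed as fitted attributes instead of a returned tuple.
model = HDBSCAN(
    min_cluster_size=25,
    allow_single_cluster=True,
    cluster_selection_method="leaf",
    n_jobs=-1,
).fit(features)

labels = model.labels_  # -1 still marks noise, as before
probabilities = model.probabilities_  # per-sample cluster membership strength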