From 45bd52967c2cd33ca507164c7e3c22f92473fca7 Mon Sep 17 00:00:00 2001 From: mamdouhJ Date: Fri, 19 Dec 2025 12:04:05 +0100 Subject: [PATCH 1/4] Added the ability to log losses in most deep clustering algorithms --- .../deep/_abstract_deep_clustering_algo.py | 15 ++++++++ clustpy/deep/_train_utils.py | 12 +++++-- clustpy/deep/aec.py | 29 +++++++++++---- clustpy/deep/dcn.py | 36 +++++++++++++------ clustpy/deep/ddc_n2d.py | 11 +++--- clustpy/deep/dec.py | 31 +++++++++++----- clustpy/deep/deepect.py | 31 +++++++++++----- clustpy/deep/den.py | 17 +++++++-- clustpy/deep/dkm.py | 29 ++++++++++----- .../neural_networks/_abstract_autoencoder.py | 11 ++++-- clustpy/deep/vade.py | 29 +++++++++------ 11 files changed, 187 insertions(+), 64 deletions(-) diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index c776ff6..a9e04ad 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -1,3 +1,4 @@ +from collections import defaultdict from clustpy.deep._utils import set_torch_seed from sklearn.base import TransformerMixin, BaseEstimator, ClusterMixin import numpy as np @@ -37,6 +38,20 @@ def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neu self.embedding_size = embedding_size self.device = device self.random_state = random_state + self.history_ = defaultdict(list) + + def _log_history(self, key: str, value) -> None: + """ + Log pretraining and clustering history values (e.g. loss values) during training. 
+ + Parameters + ---------- + key : str + the key under which to store the value + value : float + + """ + self.history_[key].append(float(value)) def _check_parameters(self, X: np.ndarray, *, y: np.ndarray=None) -> (np.ndarray, np.ndarray, np.random.RandomState, dict, dict, dict): """ diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index e776d96..d04363c 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -111,7 +111,9 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, - neural_network_params: dict = None, neural_network_weights: str = None, + neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, + neural_network_weights: str = None, random_state: np.random.RandomState | int = None) -> torch.nn.Module: """This function returns a trained neural network. The following cases are considered - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again. @@ -147,6 +149,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. 
neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) random_state : np.random.RandomState | int @@ -171,7 +175,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, - optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn) + optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn,log_fn=log_fn) return neural_network @@ -185,6 +189,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c random_state: np.random.RandomState, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, neural_network_weights: str = None) -> ( torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin): @@ -231,6 +236,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. 
neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) @@ -256,6 +263,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c neural_network=neural_network, neural_network_class=neural_network_class, neural_network_params=neural_network_params, neural_network_weights=neural_network_weights, + log_fn=log_fn, random_state=random_state) # Execute initial clustering in embedded space embedded_data = encode_batchwise(testloader, neural_network) diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py index fcf6e61..92b3eb6 100644 --- a/clustpy/deep/aec.py +++ b/clustpy/deep/aec.py @@ -21,7 +21,7 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual AEC clustering procedure on the input data set. @@ -70,9 +70,13 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution + + Returns ------- tuple : (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module) @@ -84,9 +88,9 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup AEC Module - aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance,log_fn).to_device(device) # Use AEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # AEC Training loop @@ -120,11 +124,13 @@ class _AEC_Module(_DCN_Module): the cluster centers augmentation_invariance : bool Is augmentation invariance used + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training """ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, - augmentation_invariance: bool = False): - super().__init__(init_np_labels, init_np_centers, augmentation_invariance) + augmentation_invariance: bool = False, log_fn: Callable | None = None): + super().__init__(init_np_labels, init_np_centers, augmentation_invariance,log_fn) def update_centroids(self, embedded: np.ndarray, labels: np.ndarray) -> torch.Tensor: """ @@ -188,14 +194,18 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clustering_loss = 0 for batch in trainloader: # Beware that the clustering loss of DCN is divided by 2, therefore we use 2 * clustering_loss_weight loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, 2 * clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) @@ -207,6 +217,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # update assignments labels = self.predict_hard(torch.tensor(embedded).to(device)) self.labels = labels.to(device) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clustering_loss) return self @@ -349,6 +363,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': self.initial_clustering_class, initial_clustering_params, self.device, + self._log_history, random_state) self.labels_ = aec_labels self.cluster_centers_ = aec_centers diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py index 7fcf906..79a4ccb 100644 --- a/clustpy/deep/dcn.py +++ 
b/clustpy/deep/dcn.py @@ -22,7 +22,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + random_state: np.random.RandomState, + log_fn: Callable | None) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DCN clustering procedure on the input data set. @@ -71,6 +72,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -87,9 +90,9 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DCN Module - dcn_module = _DCN_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + dcn_module = _DCN_Module(init_labels, init_centers, augmentation_invariance,log_fn).to_device(device) # Use DCN optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = 
optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DEC Training loop @@ -152,6 +155,8 @@ class _DCN_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training Attributes ---------- @@ -163,7 +168,7 @@ class _DCN_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False): + def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False,log_fn: Callable | None=None): super().__init__() self.augmentation_invariance = augmentation_invariance self.labels = torch.from_numpy(init_np_labels) @@ -171,6 +176,7 @@ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augm # Init for count from original DCN code (not reported in Paper) # This means centroid learning rate at the beginning is scaled by a hundred self.counts = torch.ones(self.centers.shape[0], dtype=torch.int32) * 100 + self.log_fn = log_fn def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: """ @@ -188,7 +194,7 @@ def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor loss: torch.Tensor the final DCN loss """ - loss = (embedded - self.centers[labels]).pow(2).sum() / embedded.shape[0] + loss = (embedded - self.centers[labels.long()]).pow(2).sum() / embedded.shape[0] return loss def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: @@ -266,6 +272,8 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla weight of the clustering loss ssl_loss_weight : float weight of the self-supervised learning (ssl) loss + log_fn : Callable 
| None + function for logging training history values (e.g. loss values) during training device : torch.device device to be trained on @@ -290,8 +298,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla # compute total loss loss = ssl_loss_weight * ssl_loss + 0.5 * clustering_loss_weight * cluster_loss - - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -331,13 +338,17 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + ssl_loss = 0 + clustering_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + ssl_loss += loss[1].item() + clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Update Assignments and Centroids with torch.no_grad(): @@ -362,6 +373,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat self.counts = counts postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("total_loss", total_loss) + self.log_fn("ssl_loss", ssl_loss) + self.log_fn("clustering_loss", clustering_loss) return self @@ -509,7 +524,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dcn_labels_ = dcn_labels diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py index 6cde45b..291bf6b 100644 --- a/clustpy/deep/ddc_n2d.py +++ b/clustpy/deep/ddc_n2d.py @@ -24,7 +24,7 
@@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple, manifold_class: TransformerMixin, manifold_params: dict, clustering_class: ClusterMixin, clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> ( + random_state: np.random.RandomState, log_fn: Callable | None ) -> ( int, np.ndarray, np.ndarray, torch.nn.Module, TransformerMixin): """ Execute a manifold-based sequential deep clustering procedure on the input data set. @@ -69,6 +69,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in The device on which to perform the computations random_state : np.random.RandomState use a fixed random state to get a repeatable solution + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training Returns ------- @@ -88,7 +90,7 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_weights=neural_network_weights, - random_state=random_state) + log_fn=log_fn, random_state=random_state) # Encode data X_embed = encode_batchwise(testloader, neural_network) # Get possible input parameters of the manifold class @@ -356,7 +358,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params, DDC_density_peak_clustering, {"ratio": self.ratio}, self.device, - random_state) + random_state,self._log_history) self.labels_ = labels self.n_clusters_ = n_clusters self.cluster_centers_ = centers_ae @@ -503,7 +505,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': manifold_params, GMM, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = labels.astype(np.int32) 
self.cluster_centers_manifold_ = centers_manifold self.cluster_centers_ = centers_ae diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py index a2501fb..b7518ee 100644 --- a/clustpy/deep/dec.py +++ b/clustpy/deep/dec.py @@ -22,7 +22,8 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DEC clustering procedure on the input data set. @@ -90,9 +91,9 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DEC Module - dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance).to(device) + dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance,log_fn).to(device) # Use DEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(dec_module.parameters()), **clustering_optimizer_params) @@ -205,10 +206,11 @@ class _DEC_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, 
init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() self.alpha = alpha self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -331,6 +333,8 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we the final DEC loss """ loss = torch.tensor(0.).to(device) + ssl_loss = torch.tensor(0.).to(device) + cluster_loss = torch.tensor(0.).to(device) # Reconstruction loss is not included in DEC if ssl_loss_weight != 0: if self.augmentation_invariance: @@ -355,7 +359,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we cluster_loss = self.dec_loss(embedded) loss += cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -390,16 +394,26 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat tbar = tqdm.trange(n_epochs, desc="DEC training") for _ in tbar: total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() if ssl_loss_weight != 0 else 0 + total_cluster_loss += loss[2].item() + # Backward pass optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + 
if ssl_loss_weight != 0: + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) return self @@ -547,7 +561,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': self.augmentation_invariance, self.initial_clustering_class, initial_clustering_params, - self.device, random_state) + self.device, random_state, + log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dec_labels_ = dec_labels diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py index 028afbb..86da71b 100644 --- a/clustpy/deep/deepect.py +++ b/clustpy/deep/deepect.py @@ -79,10 +79,12 @@ class _DeepECT_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training """ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_interval: int, - pruning_threshold: float, augmentation_invariance: bool = False): + pruning_threshold: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() # Create initial cluster tree self.cluster_tree = cluster_tree @@ -90,6 +92,7 @@ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_ self.grow_interval = grow_interval self.pruning_threshold = pruning_threshold self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: """ @@ -137,7 +140,7 @@ def _get_labels_from_leafs(self, embedded: torch.Tensor, leaf_nodes: list) -> ( leaf_labels = torch.stack([leaf.torch_labels[0] for leaf in leaf_nodes]) # Get distances between points and centers. 
Get nearest center squared_diffs = squared_euclidean_distance(embedded, leaf_centers) - cluster_center_assignments = (squared_diffs.min(dim=1)[1]).int() + cluster_center_assignments = (squared_diffs.min(dim=1)[1]).long() labels = leaf_labels[cluster_center_assignments] return leaf_centers, cluster_center_assignments, labels @@ -372,7 +375,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla dc_loss = self._data_compression_loss(embedded, split_nodes, labels, device, embedded_aug) # Combine losses loss = clustering_loss_weight * (nc_loss + dc_loss) + ssl_loss_weight * ssl_loss - return loss, labels + return (loss, ssl_loss, nc_loss+dc_loss), labels def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -415,6 +418,8 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for epoch in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clust_loss = 0 with torch.no_grad(): # Grow tree if (epoch % self.grow_interval == 0 or self.cluster_tree.n_leaf_nodes_ < 2) and len( @@ -426,10 +431,12 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # Calculate loss loss, labels = self._loss(batch, neural_network, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight, leaf_nodes, split_nodes, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clust_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Adapt centers and weights of split nodes analytically with torch.no_grad(): @@ -440,6 +447,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat leaf_nodes, split_nodes = self.cluster_tree.get_leaf_and_split_nodes() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if 
self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clust_loss) return self @@ -449,7 +460,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DeepECT clustering procedure on the input data set. @@ -497,6 +508,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -511,11 +524,11 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op device, trainloader, testloader, _, neural_network, _, _, _, init_leafnode_centers, _ = get_default_deep_clustering_initialization( X, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, KMeans, {"n_init": 20}, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) cluster_tree = BinaryClusterTree(_DeepECT_ClusterTreeNode) # Setup DeepECT Module deepect_module = _DeepECT_Module(cluster_tree, max_n_leaf_nodes, grow_interval, pruning_threshold, - augmentation_invariance).to(device) + augmentation_invariance,log_fn).to(device) # Use DeepECT optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DeepECT Training loop @@ -651,7 +664,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": self.neural_network, self.neural_network_weights, self.embedding_size, self.clustering_loss_weight, self.ssl_loss_weight, self.custom_dataloaders, self.augmentation_invariance, self.device, - random_state) + self._log_history, random_state) self.tree_ = tree self.labels_ = labels self.neural_network_trained_ = neural_network diff --git a/clustpy/deep/den.py b/clustpy/deep/den.py index 7306175..80336b0 100644 --- a/clustpy/deep/den.py +++ b/clustpy/deep/den.py @@ -229,7 +229,7 @@ def _loss(self, batch: list, group_size: list, neural_network: torch.nn.Module, # Calculate group sparsity constraint group_sparsity_loss = self._group_sparsity_loss(embedded, group_size) loss = ssl_loss + self.weight_locality_constraint * locality_preserving_loss + 
self.weight_sparsity_constraint * group_sparsity_loss - return loss + return loss, ssl_loss, locality_preserving_loss, group_sparsity_loss def _get_nearest_neighbors(self, X: np.ndarray) -> list: @@ -294,15 +294,25 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_locality_loss = 0 + total_sparsity_loss = 0 for batch in trainloader: loss = self._loss(batch, group_size, neural_network, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_locality_loss += loss[2].item() + total_sparsity_loss += loss[3].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + self._log_history("Total Loss", total_loss) + self._log_history("SSL Loss", total_ssl_loss) + self._log_history("Locality Loss", total_locality_loss) + self._log_history("Sparsity Loss", total_sparsity_loss) # Execute clustering with Kmeans embedded_data = encode_batchwise(testloader, neural_network) kmeans = KMeans(n_clusters=self.n_clusters, random_state=random_state) @@ -312,4 +322,5 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': self.cluster_centers_ = kmeans.cluster_centers_ self.neural_network_trained_ = neural_network self.set_n_featrues_in(X) + return self diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py index 896a6f3..f2295ba 100644 --- a/clustpy/deep/dkm.py +++ b/clustpy/deep/dkm.py @@ -20,7 +20,7 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, - initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) 
-> ( + initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState,log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DKM clustering procedure on the input data set. @@ -90,9 +90,9 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DKM Module - dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance).to(device) + dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance,log_fn).to(device) # Use DKM optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(dkm_module.parameters()), **clustering_optimizer_params) @@ -183,10 +183,12 @@ class _DKM_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_centers: np.ndarray, alphas: list, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alphas: list, + augmentation_invariance: bool = False, log_fn: Callable | None = None): super().__init__() self.alphas = alphas self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -320,7 +322,7 @@ def _loss(self, batch: list, alpha: float, neural_network: torch.nn.Module, clus # Calculate clustering loss cluster_loss = self.dkm_loss(embedded, alpha) 
loss = ssl_loss_weight * ssl_loss + cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -357,17 +359,27 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for alpha in self.alphas: for _ in range(n_epochs): total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, alpha, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_cluster_loss += loss[2].item() # Backward pass optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss, "Alpha": alpha} tbar.set_postfix(postfix_str) tbar.update() + if self.log_fn is not None: + self.log_fn("Alpha completed", alpha) + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) + return self @@ -543,7 +555,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dkm_labels_ = dkm_labels diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py index f582157..59afbae 100644 --- a/clustpy/deep/neural_networks/_abstract_autoencoder.py +++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py @@ -279,7 +279,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | 
torch.nn.modules.loss._Loss = mean_squared_error, patience: int = 5, scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = {}, - corruption_fn: Callable = None, model_path: str = None) -> '_AbstractAutoencoder': + corruption_fn: Callable = None, model_path: str = None,log_fn: Callable[[str, float], None] = None) -> '_AbstractAutoencoder': """ Trains the autoencoder in place. @@ -316,7 +316,8 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None) model_path : str if specified will save the trained model to the location. If evalloader is used, then only the best model w.r.t. evaluation loss is saved (default: None) - + log_fn : Callable[[str, float], None] + function that takes a string and a float as input and logs the training process (default: None) Returns ------- self : _AbstractAutoencoder @@ -355,6 +356,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in # training loop device = get_device_from_module(self) tbar = tqdm.trange(n_epochs, desc="AE training") + for epoch_i in tbar: self.train() total_loss = 0 @@ -384,6 +386,11 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in print(f"Stop training at epoch {best_epoch}. 
Best Loss: {best_loss:.6f}, Last Loss: {val_loss:.6f}") if scheduler is not None and eval_step_scheduler: scheduler.step(val_loss) + if log_fn is not None: + if evalloader is not None: + log_fn("pretrain/Eval Loss", val_loss.item()) + log_fn("pretrain/Train Loss", total_loss) + tbar.set_postfix(postfix_str) # change to eval mode after training self.eval() diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py index b5d6e79..bd1cc6c 100644 --- a/clustpy/deep/vade.py +++ b/clustpy/deep/vade.py @@ -23,7 +23,8 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual VaDE clustering procedure on the input data set. 
@@ -88,13 +89,13 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_means, init_clustering_algo = get_default_deep_clustering_initialization( X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, _VaDE_VAE, neural_network_weights=neural_network_weights) + random_state, _VaDE_VAE, log_fn=log_fn, neural_network_weights=neural_network_weights) # Get parameters from initial clustering algorithm init_weights = None if not hasattr(init_clustering_algo, "weights_") else init_clustering_algo.weights_ init_covs = None if not hasattr(init_clustering_algo, "covariances_") else init_clustering_algo.covariances_ # Initialize VaDE vade_module = _VaDE_Module(n_clusters=n_clusters, embedding_size=embedding_size, weights=init_weights, - means=init_means, variances=init_covs).to(device) + means=init_means, variances=init_covs,log_fn=log_fn).to(device) # Use vade learning_rate (usually pretrain_optimizer_params reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(vade_module.parameters()), **clustering_optimizer_params) @@ -238,7 +239,7 @@ class _VaDE_Module(torch.nn.Module): """ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = None, means: torch.Tensor = None, - variances: torch.Tensor = None): + variances: torch.Tensor = None, log_fn: Callable | None = None): super(_VaDE_Module, self).__init__() if weights is None: # if not initialized then use uniform distribution @@ -254,7 +255,7 @@ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = embedding_size), "Shape of the initial variances for the Vade_Module must be (n_clusters, embedding_size)" self.p_log_var = 
torch.nn.Parameter(torch.log(torch.tensor(variances)), requires_grad=True) self.normalize_prob = torch.nn.Softmax(dim=0) - + self.log_fn = log_fn def predict(self, q_mean: torch.Tensor, q_logvar: torch.Tensor) -> torch.Tensor: """ Predict the labels given the specific means and variances of given samples. @@ -305,9 +306,9 @@ def vade_loss(self, neural_network: VariationalAutoencoder, batch_data: torch.Te z, q_mean, q_logvar, reconstruction = neural_network.forward(batch_data) pi_normalized = self.normalize_prob(self.pi) p_c_z = _get_gamma(pi_normalized, self.p_mean, self.p_log_var, z) - loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, + loss, ssl_loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, reconstruction, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - return loss + return loss, ssl_loss def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.data.DataLoader, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -348,16 +349,21 @@ def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.da for _ in tbar: self.train() total_loss = 0 + total_ssl_loss = 0 for batch in trainloader: # load batch on device batch_data = batch[1].to(device) loss = self.vade_loss(neural_network, batch_data, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) tbar.set_postfix(postfix_str) return self @@ -478,7 +484,7 @@ def _compute_vade_loss(pi: torch.Tensor, p_mean: torch.Tensor, p_log_var: torch. 
loss = p_z_c - p_c - q_z_x + q_c_x loss /= batch_data.size(0) loss = clustering_loss_weight * loss + ssl_loss_weight * p_x_z # Beware that we do not divide two times by number of samples - return loss + return loss, p_x_z class VaDE(_AbstractDeepClusteringAlgo): @@ -630,7 +636,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = gmm_labels self.cluster_centers_ = gmm_means self.covariances_ = gmm_covariances From 6fb60bb5b09bde622a43d329dd52cedc420f1cc5 Mon Sep 17 00:00:00 2001 From: mamdouhJ Date: Tue, 30 Dec 2025 16:20:13 +0100 Subject: [PATCH 2/4] added ability to use early stopping by passing a validation set in deep methods --- .gitignore | 1 + clustpy/deep/_train_utils.py | 16 +++++++++---- clustpy/deep/aec.py | 10 ++++---- clustpy/deep/dcn.py | 10 ++++---- clustpy/deep/ddc_n2d.py | 24 ++++++++++++------- clustpy/deep/dec.py | 13 ++++++---- clustpy/deep/deepect.py | 12 ++++++---- clustpy/deep/dipdeck.py | 2 +- clustpy/deep/dipencoder.py | 6 +++-- clustpy/deep/dkm.py | 12 ++++++---- .../neural_networks/_abstract_autoencoder.py | 5 +++- clustpy/deep/vade.py | 11 ++++++--- 12 files changed, 83 insertions(+), 39 deletions(-) diff --git a/.gitignore b/.gitignore index ea0ec8a..34f4ab7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class +testing_preprocessing.py # C extensions *.so *.dll diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index d04363c..69f6389 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -105,7 +105,7 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: return neural_network -def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None, +def get_trained_network(trainloader: torch.utils.data.DataLoader = None, evalloader: 
torch.utils.data.DataLoader | None = None, data: np.ndarray = None, n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10, @@ -125,6 +125,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n ---------- trainloader : torch.utils.data.DataLoader dataloader used to train neural_network (default: None) + evalloader : torch.utils.data.DataLoader | None + dataloader used for early stopping during training (default: None) data : np.ndarray train data set. If data is passed then trainloader can remain empty (default: None) n_epochs : int @@ -174,12 +176,12 @@ print("Neural network is not fitted yet, will be pretrained.") # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params - neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, + neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, evalloader=evalloader, optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn,log_fn=log_fn) return neural_network -def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int, +def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, val_set: np.ndarray | torch.Tensor | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -201,6 +203,8 @@ ---------- X : np.ndarray | torch.Tensor the given data set. 
Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | torch.Tensor | None + validation data set. Can be a np.ndarray or a torch.Tensor. If None, no validation set will be used n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN batch_size : int @@ -257,7 +261,11 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c """ device = detect_device(device) trainloader, testloader, batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + if val_set is not None: + evalloader = get_dataloader(val_set, batch_size, shuffle=True) + else: + evalloader = None + neural_network = get_trained_network(trainloader, evalloader= evalloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_class=neural_network_class, diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py index 92b3eb6..6ebb6ff 100644 --- a/clustpy/deep/aec.py +++ b/clustpy/deep/aec.py @@ -14,7 +14,7 @@ from collections.abc import Callable -def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _aec(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, @@ -31,6 +31,8 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. 
Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -86,7 +88,7 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup AEC Module @@ -329,7 +331,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'AEC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -347,7 +349,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': this instance of the AEC algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - aec_labels, aec_centers, neural_network = _aec(X, self.n_clusters, self.batch_size, + aec_labels, aec_centers, neural_network = _aec(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py index 79a4ccb..f3a1133 100644 --- a/clustpy/deep/dcn.py +++ b/clustpy/deep/dcn.py @@ -15,7 +15,7 @@ from collections.abc import Callable -def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _dcn(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, @@ -33,6 +33,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. 
If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -88,7 +90,7 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DCN Module @@ -488,7 +490,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DCN': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -506,7 +508,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': this instance of the DCN algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, self.n_clusters, + kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py index 291bf6b..71515ce 100644 --- a/clustpy/deep/ddc_n2d.py +++ b/clustpy/deep/ddc_n2d.py @@ -6,7 +6,7 @@ import torch import numpy as np from clustpy.deep._utils import detect_device, encode_batchwise, run_initial_clustering, mean_squared_error -from clustpy.deep._data_utils import get_train_and_test_dataloader +from clustpy.deep._data_utils import get_train_and_test_dataloader, get_dataloader from clustpy.deep._train_utils import get_trained_network from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo from sklearn.manifold import TSNE @@ -18,7 +18,7 @@ from clustpy.utils.checks import check_parameters -def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _manifold_based_sequential_dc(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple, @@ -33,6 +33,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in ---------- X : np.ndarray / torch.Tensor the given data set. 
Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray / torch.Tensor | None + validation set (can be ignored) n_clusters : int number of clusters (can be None) batch_size : int @@ -85,8 +87,12 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in # Get the device to train on device = detect_device(device) trainloader, testloader, _ = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) + if val_set is not None: + valloader = get_dataloader(val_set, batch_size, shuffle=False) + else: + valloader = None # Get initial AE - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + neural_network = get_trained_network(trainloader,valloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_weights=neural_network_weights, @@ -137,7 +143,7 @@ class DDC_density_peak_clustering(ClusterMixin, BaseEstimator): def __init__(self, ratio: float): self.ratio = ratio - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC_density_peak_clustering': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC_density_peak_clustering': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -325,7 +331,7 @@ def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer self.custom_dataloaders = custom_dataloaders self.tsne_params = tsne_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -346,7 +352,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params = {"n_components": 2} if self.tsne_params is None else self.tsne_params if self.ratio > 1: print("[WARNING] ratio for DDC algorithm has been set to a value > 1 which can cause poor results") - n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X, None, self.batch_size, + n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X,val_set, None, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, @@ -472,7 +478,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.manifold_params = manifold_params self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'N2D': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -481,6 +487,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -491,7 +499,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': """ X, _, random_state, pretrain_optimizer_params, _, initial_clustering_params = self._check_parameters(X, y=y) manifold_params = {"n_components": self.n_clusters} if self.manifold_params is None else self.manifold_params - _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, self.n_clusters, + _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py index b7518ee..342ec81 100644 --- a/clustpy/deep/dec.py +++ b/clustpy/deep/dec.py @@ -16,7 +16,7 @@ from collections.abc import Callable# -def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, +def _dec(X: np.ndarray,val_set: np.ndarray | None, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, @@ -32,6 +32,8 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used n_clusters : int number of clusters. 
Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN alpha : float @@ -89,7 +91,7 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DEC Module @@ -525,7 +527,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25 self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': + def fit(self, X: np.ndarray,val_set: np.ndarray | None = None, y: np.ndarray = None) -> 'DEC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -534,7 +536,9 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': ---------- X : np.ndarray the given data set + val_set : np.ndarray | None + optional validation set used for early stopping during pretraining (default: None) + y : np.ndarray the labels (can be ignored) @@ -544,7 +549,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': """ ssl_loss_weight = self.ssl_loss_weight if hasattr(self, "ssl_loss_weight") else 0 # DEC does not use ssl loss when clustering X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X, self.n_clusters, self.alpha, + kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X,val_set, self.n_clusters, self.alpha, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py index 86da71b..364ec2e 100644 --- a/clustpy/deep/deepect.py +++ b/clustpy/deep/deepect.py @@ -454,7 +454,7 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat return self -def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, +def _deep_ect(X: np.ndarray, val_set: np.ndarray | None, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, grow_interval: int, pruning_threshold: float, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, @@ -468,6 +468,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op ---------- X : np.ndarray The given data set. 
Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) max_n_leaf_nodes : int Maximum number of leaf nodes in the cluster tree batch_size : int @@ -522,7 +524,7 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, _, _, init_leafnode_centers, _ = get_default_deep_clustering_initialization( - X, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, KMeans, {"n_init": 20}, device, random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) cluster_tree = BinaryClusterTree(_DeepECT_ClusterTreeNode) @@ -639,7 +641,7 @@ def __init__(self, max_n_leaf_nodes: int = 20, batch_size: int = 256, pretrain_o self.custom_dataloaders = custom_dataloaders self.augmentation_invariance = augmentation_invariance - def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> "DeepECT": """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -648,6 +650,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -657,7 +661,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": This instance of the DeepECT algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, _ = self._check_parameters(X, y=y) - tree, labels, neural_network = _deep_ect(X, self.max_n_leaf_nodes, self.batch_size, + tree, labels, neural_network = _deep_ect(X, val_set, self.max_n_leaf_nodes, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, self.clustering_epochs, self.grow_interval, self.pruning_threshold, self.optimizer_class, self.ssl_loss_fn, diff --git a/clustpy/deep/dipdeck.py b/clustpy/deep/dipdeck.py index 8433330..7d9065b 100644 --- a/clustpy/deep/dipdeck.py +++ b/clustpy/deep/dipdeck.py @@ -469,7 +469,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla loss = ssl_loss_weight * ssl_loss + clustering_loss_weight * cluster_loss return loss - def fit(self, X, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, + def fit(self, X, val_set: np.ndarray | None, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, clustering_loss_weight: float, ssl_loss_weight: float, debug: bool) -> '_DipDECK_Module': diff --git a/clustpy/deep/dipencoder.py b/clustpy/deep/dipencoder.py index bf6774e..4d82748 100644 --- a/clustpy/deep/dipencoder.py +++ b/clustpy/deep/dipencoder.py @@ -785,7 +785,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = None, pretrain_optimiz self.initial_clustering_class = initial_clustering_class 
self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DipEncoder': """ Initiate the actual clustering/dimensionality reduction process on the input data set. If no ground truth labels are given, the resulting cluster labels will be stored in the labels_ attribute. @@ -794,6 +794,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': ---------- X : np.ndarray The given (training) data set + val_set : np.ndarray + The validation data set (not used in DipEncoder, included for compatibility reasons) (default: None) y : np.ndarray The ground truth labels. If None, the DipEncoder will be used for clustering (default: None) @@ -807,7 +809,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': batch_size = 25 * self.n_clusters if self.batch_size is None else self.batch_size # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, X_embed, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, + X, val_set, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, self.neural_network, self.embedding_size, self.custom_dataloaders, self.initial_clustering_class if y is None else None, initial_clustering_params, self.device, random_state, neural_network_weights=self.neural_network_weights) if y is not None: diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py index f2295ba..65426ad 100644 --- a/clustpy/deep/dkm.py +++ b/clustpy/deep/dkm.py @@ -14,7 +14,7 @@ from collections.abc import Callable -def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, 
pretrain_optimizer_params: dict, +def _dkm(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, @@ -29,6 +29,8 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN alphas : list | tuple @@ -88,7 +90,7 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DKM Module @@ -518,7 +520,7 @@ def _check_alphas(self) -> list: assert type(alphas) is tuple or type(alphas) is list, "alphas must be a list, int or tuple" return alphas - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DKM': """ Initiate the actual clustering process on the input data set. 
The resulting cluster labels will be stored in the labels_ attribute. @@ -527,6 +529,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -537,7 +541,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) alphas = self._check_alphas() - kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, self.n_clusters, alphas, + kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, val_set, self.n_clusters, alphas, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py index 59afbae..3bb3d51 100644 --- a/clustpy/deep/neural_networks/_abstract_autoencoder.py +++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py @@ -266,7 +266,7 @@ def evaluate(self, dataloader: torch.utils.data.DataLoader, ssl_loss_fn: Callabl """ with torch.no_grad(): self.eval() - loss = torch.tensor(0.) + loss = torch.tensor(0.0,device=device) for batch in dataloader: new_loss, _, _ = self.loss(batch, ssl_loss_fn, device) loss += new_loss @@ -384,8 +384,11 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in self.save_parameters(model_path) if early_stopping.early_stop: print(f"Stop training at epoch {best_epoch}. 
Best Loss: {best_loss:.6f}, Last Loss: {val_loss:.6f}")
+                break
             if scheduler is not None and eval_step_scheduler:
                 scheduler.step(val_loss)
+            # NOTE(review): the eval loss is logged once below (guarded by
+            # evalloader; val_loss is undefined otherwise) - do not log it here
             if log_fn is not None:
                 if evalloader is not None:
                     log_fn("pretrain/Eval Loss", val_loss.item())
diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py
index bd1cc6c..06ccc23 100644
--- a/clustpy/deep/vade.py
+++ b/clustpy/deep/vade.py
@@ -17,7 +17,7 @@
 from collections.abc import Callable
 
 
-def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
+def _vade(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
           clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
           optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
           neural_network: torch.nn.Module | tuple, neural_network_weights: str,
@@ -33,6 +33,8 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa
     ----------
     X : np.ndarray / torch.Tensor
         the given data set. Can be a np.ndarray or a torch.Tensor
+    val_set : np.ndarray / torch.Tensor | None
+        validation set (can be ignored)
     n_clusters : int
         number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. 
DBSCAN batch_size : int @@ -87,7 +89,7 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_means, init_clustering_algo = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, random_state, _VaDE_VAE, log_fn=log_fn, neural_network_weights=neural_network_weights) # Get parameters from initial clustering algorithm @@ -595,7 +597,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'VaDE': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -604,6 +606,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE':
         ----------
         X : np.ndarray
             the given data set
+        val_set : np.ndarray
+            validation set (can be ignored)
         y : np.ndarray
             the labels (can be ignored)
 
@@ -619,6 +623,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE':
                                           "covariance_type": "diag"} if self.initial_clustering_params is None else self.initial_clustering_params
         gmm_labels, gmm_means, gmm_covariances, gmm_weights, vade_labels, vade_centers, vade_covariances, neural_network = _vade(
             X,
+            val_set,
             self.n_clusters,
             self.batch_size,
             pretrain_optimizer_params,

From a227859fed4d1220a6005bebb0f8ed981c280fa5 Mon Sep 17 00:00:00 2001
From: Mamdouh Aljoud <89313991+mamdouhJ@users.noreply.github.com>
Date: Thu, 8 Jan 2026 20:34:16 +0100
Subject: [PATCH 3/4] Fixed seeding in autoencoder instances

---
 clustpy/deep/neural_networks/_abstract_autoencoder.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py
index 3bb3d51..86e661f 100644
--- a/clustpy/deep/neural_networks/_abstract_autoencoder.py
+++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py
@@ -118,6 +118,8 @@ def __init__(self, work_on_copy: bool = True, random_state: np.random.RandomStat
         self.random_state = random_state
         self.fitted = False
         self.allow_nd_input = False
+        rs = check_random_state(self.random_state)
+        set_torch_seed(rs)
 
     def encode(self, x: torch.Tensor) -> torch.Tensor:
         """

From 13f5f36ee888290437b6c08ca79789025f2a381f Mon Sep 17 00:00:00 2001
From: Mamdouh Aljoud <89313991+mamdouhJ@users.noreply.github.com>
Date: Mon, 12 Jan 2026 15:12:46 +0100
Subject: [PATCH 4/4] DDC accepts n_clusters

---
 clustpy/deep/ddc_n2d.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py
index 71515ce..e1dd995 100644
--- a/clustpy/deep/ddc_n2d.py
+++ 
b/clustpy/deep/ddc_n2d.py
@@ -316,13 +316,14 @@ class DDC(_AbstractDeepClusteringAlgo):
     Knowledge-Based Systems 197 (2020): 105841.
     """
 
-    def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None,
+    def __init__(self, n_clusters: int = None, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None,
                  pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam,
                  ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error,
                  neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None,
                  embedding_size: int = 10, custom_dataloaders: tuple = None, tsne_params: dict = None,
                  device: torch.device = None, random_state: np.random.RandomState | int = None):
         super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state)
+        self.n_clusters = n_clusters
         self.ratio = ratio
         self.pretrain_optimizer_params = pretrain_optimizer_params
         self.pretrain_epochs = pretrain_epochs
@@ -352,7 +353,7 @@ def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -
         tsne_params = {"n_components": 2} if self.tsne_params is None else self.tsne_params
         if self.ratio > 1:
             print("[WARNING] ratio for DDC algorithm has been set to a value > 1 which can cause poor results")
-        n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X,val_set, None, self.batch_size,
+        n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X, val_set, self.n_clusters, self.batch_size,
                                                                                                 pretrain_optimizer_params,
                                                                                                 self.pretrain_epochs,
                                                                                                 self.optimizer_class,