diff --git a/.gitignore b/.gitignore index ea0ec8a..34f4ab7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *$py.class +testing_preprocessing.py # C extensions *.so *.dll diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index c776ff6..a9e04ad 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -1,3 +1,4 @@ +from collections import defaultdict from clustpy.deep._utils import set_torch_seed from sklearn.base import TransformerMixin, BaseEstimator, ClusterMixin import numpy as np @@ -37,6 +38,20 @@ def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neu self.embedding_size = embedding_size self.device = device self.random_state = random_state + self.history_ = defaultdict(list) + + def _log_history(self, key: str, value) -> None: + """ + Log pretraining and clustering history values (e.g. loss values) during training. 
+ + Parameters + ---------- + key : str + the key under which to store the value + value : float + + """ + self.history_[key].append(float(value)) def _check_parameters(self, X: np.ndarray, *, y: np.ndarray=None) -> (np.ndarray, np.ndarray, np.random.RandomState, dict, dict, dict): """ diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index e776d96..69f6389 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -105,13 +105,15 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network: return neural_network -def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None, +def get_trained_network(trainloader: torch.utils.data.DataLoader = None, evalloader: torch.utils.data.DataLoader | None = None, data: np.ndarray = None, n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10, neural_network: torch.nn.Module | tuple = None, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, - neural_network_params: dict = None, neural_network_weights: str = None, + neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, + neural_network_weights: str = None, random_state: np.random.RandomState | int = None) -> torch.nn.Module: """This function returns a trained neural network. The following cases are considered - If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again. 
@@ -123,6 +125,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n ---------- trainloader : torch.utils.data.DataLoader dataloader used to train neural_network (default: None) + evalloader : torch.utils.data.DataLoader | None + dataloader used for earlystopping during training (default: None) data : np.ndarray train data set. If data is passed then trainloader can remain empty (default: None) n_epochs : int @@ -147,6 +151,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) random_state : np.random.RandomState | int @@ -170,12 +176,12 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n print("Neural network is not fitted yet, will be pretrained.") # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params - neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, - optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn) + neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, evalloader=evalloader, + optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn,log_fn=log_fn) return neural_network -def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int, +def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, val_set: np.ndarray | torch.Tensor | None, n_clusters: int, batch_size: 
int, pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -185,6 +191,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c random_state: np.random.RandomState, neural_network_class: torch.nn.Module = FeedforwardAutoencoder, neural_network_params: dict = None, + log_fn: Callable[[str, float], None] = None, neural_network_weights: str = None) -> ( torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int, np.ndarray, np.ndarray, ClusterMixin): @@ -196,6 +203,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c ---------- X : np.ndarray | torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | torch.Tensor | None + validation data set. Can be a np.ndarray or a torch.Tensor. If None, no validation set will be used n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN batch_size : int @@ -231,6 +240,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c The neural network class that should be used (default: FeedforwardAutoencoder) neural_network_params : dict Parameters to be used when creating a new neural network using the neural_network_class (default: None) + log_fn : Callable[[str, float], None] + Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters. 
neural_network_weights : str Path to a file containing the state_dict of the neural_network (default: None) @@ -250,12 +261,17 @@ """ device = detect_device(device) trainloader, testloader, batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + if val_set is not None: + evalloader = get_dataloader(val_set, batch_size, shuffle=False) + else: + evalloader = None + neural_network = get_trained_network(trainloader, evalloader=evalloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_class=neural_network_class, neural_network_params=neural_network_params, neural_network_weights=neural_network_weights, + log_fn=log_fn, random_state=random_state) # Execute initial clustering in embedded space embedded_data = encode_batchwise(testloader, neural_network) diff --git a/clustpy/deep/aec.py b/clustpy/deep/aec.py index fcf6e61..6ebb6ff 100644 --- a/clustpy/deep/aec.py +++ b/clustpy/deep/aec.py @@ -14,14 +14,14 @@ from collections.abc import Callable -def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _aec(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - 
random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual AEC clustering procedure on the input data set. @@ -31,6 +31,8 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -70,9 +72,13 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution + + Returns ------- tuple : (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module) @@ -82,11 +88,11 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup AEC Module - aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance,log_fn).to_device(device) # Use AEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # AEC Training loop @@ -120,11 +126,13 @@ class _AEC_Module(_DCN_Module): the cluster centers augmentation_invariance : bool Is augmentation invariance used + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training """ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, - augmentation_invariance: bool = False): - super().__init__(init_np_labels, init_np_centers, augmentation_invariance) + augmentation_invariance: bool = False, log_fn: Callable | None = None): + super().__init__(init_np_labels, init_np_centers, augmentation_invariance,log_fn) def update_centroids(self, embedded: np.ndarray, labels: np.ndarray) -> torch.Tensor: """ @@ -188,14 +196,18 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clustering_loss = 0 for batch in trainloader: # Beware that the clustering loss of DCN is divided by 2, therefore we use 2 * clustering_loss_weight loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, 2 * clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) @@ -207,6 +219,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # update assignments labels = self.predict_hard(torch.tensor(embedded).to(device)) self.labels = labels.to(device) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clustering_loss) return self @@ -315,7 +331,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'AEC': """ 
Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -333,7 +349,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': this instance of the AEC algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - aec_labels, aec_centers, neural_network = _aec(X, self.n_clusters, self.batch_size, + aec_labels, aec_centers, neural_network = _aec(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, @@ -349,6 +365,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC': self.initial_clustering_class, initial_clustering_params, self.device, + self._log_history, random_state) self.labels_ = aec_labels self.cluster_centers_ = aec_centers diff --git a/clustpy/deep/dcn.py b/clustpy/deep/dcn.py index 7fcf906..f3a1133 100644 --- a/clustpy/deep/dcn.py +++ b/clustpy/deep/dcn.py @@ -15,14 +15,15 @@ from collections.abc import Callable -def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _dcn(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): + random_state: np.random.RandomState, + log_fn: Callable | None) -> (np.ndarray, np.ndarray, np.ndarray, 
np.ndarray, torch.nn.Module): """ Start the actual DCN clustering procedure on the input data set. @@ -32,6 +33,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par the given data set. Can be a np.ndarray or a torch.Tensor n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used batch_size : int size of the data batches pretrain_optimizer_params : dict @@ -71,6 +74,8 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par parameters for the initial clustering class device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -85,11 +90,11 @@ def _dcn(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DCN Module - dcn_module = _DCN_Module(init_labels, init_centers, augmentation_invariance).to_device(device) + dcn_module = _DCN_Module(init_labels, init_centers, 
augmentation_invariance,log_fn).to_device(device) # Use DCN optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DEC Training loop @@ -152,6 +157,8 @@ class _DCN_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training Attributes ---------- @@ -163,7 +170,7 @@ class _DCN_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False): + def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augmentation_invariance: bool = False,log_fn: Callable | None=None): super().__init__() self.augmentation_invariance = augmentation_invariance self.labels = torch.from_numpy(init_np_labels) @@ -171,6 +178,7 @@ def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray, augm # Init for count from original DCN code (not reported in Paper) # This means centroid learning rate at the beginning is scaled by a hundred self.counts = torch.ones(self.centers.shape[0], dtype=torch.int32) * 100 + self.log_fn = log_fn def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor: """ @@ -188,7 +196,7 @@ def dcn_loss(self, embedded: torch.Tensor, labels: torch.Tensor) -> torch.Tensor loss: torch.Tensor the final DCN loss """ - loss = (embedded - self.centers[labels]).pow(2).sum() / embedded.shape[0] + loss = (embedded - self.centers[labels.long()]).pow(2).sum() / embedded.shape[0] return loss def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: @@ -266,6 +274,8 @@ def _loss(self, batch: list, neural_network: 
torch.nn.Module, ssl_loss_fn: Calla weight of the clustering loss ssl_loss_weight : float weight of the self-supervised learning (ssl) loss + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training device : torch.device device to be trained on @@ -290,8 +300,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla # compute total loss loss = ssl_loss_weight * ssl_loss + 0.5 * clustering_loss_weight * cluster_loss - - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -331,13 +340,17 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for _ in tbar: # Update Network total_loss = 0 + ssl_loss = 0 + clustering_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight, clustering_loss_weight, device) - total_loss += loss.item() + total_loss += loss[0].item() + ssl_loss += loss[1].item() + clustering_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Update Assignments and Centroids with torch.no_grad(): @@ -362,6 +375,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat self.counts = counts postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("total_loss", total_loss) + self.log_fn("ssl_loss", ssl_loss) + self.log_fn("clustering_loss", clustering_loss) return self @@ -473,7 +490,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': + def fit(self, X: np.ndarray, 
val_set: np.ndarray = None, y: np.ndarray = None) -> 'DCN': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -491,7 +508,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': this instance of the DCN algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, self.n_clusters, + kmeans_labels, kmeans_centers, dcn_labels, dcn_centers, neural_network = _dcn(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -509,7 +526,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DCN': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dcn_labels_ = dcn_labels diff --git a/clustpy/deep/ddc_n2d.py b/clustpy/deep/ddc_n2d.py index 6cde45b..e1dd995 100644 --- a/clustpy/deep/ddc_n2d.py +++ b/clustpy/deep/ddc_n2d.py @@ -6,7 +6,7 @@ import torch import numpy as np from clustpy.deep._utils import detect_device, encode_batchwise, run_initial_clustering, mean_squared_error -from clustpy.deep._data_utils import get_train_and_test_dataloader +from clustpy.deep._data_utils import get_train_and_test_dataloader, get_dataloader from clustpy.deep._train_utils import get_trained_network from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo from sklearn.manifold import TSNE @@ -18,13 +18,13 @@ from clustpy.utils.checks import check_parameters -def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _manifold_based_sequential_dc(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, 
pretrain_optimizer_params: dict, pretrain_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, custom_dataloaders: tuple, manifold_class: TransformerMixin, manifold_params: dict, clustering_class: ClusterMixin, clustering_params: dict, device: torch.device, - random_state: np.random.RandomState) -> ( + random_state: np.random.RandomState, log_fn: Callable | None ) -> ( int, np.ndarray, np.ndarray, torch.nn.Module, TransformerMixin): """ Execute a manifold-based sequential deep clustering procedure on the input data set. @@ -33,6 +33,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray / torch.Tensor | None + validation set (can be ignored) n_clusters : int number of clusters (can be None) batch_size : int @@ -69,6 +71,8 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in The device on which to perform the computations random_state : np.random.RandomState use a fixed random state to get a repeatable solution + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training Returns ------- @@ -83,12 +87,16 @@ def _manifold_based_sequential_dc(X: np.ndarray, n_clusters: int, batch_size: in # Get the device to train on device = detect_device(device) trainloader, testloader, _ = get_train_and_test_dataloader(X, batch_size, custom_dataloaders) + if val_set is not None: + valloader = get_dataloader(val_set, batch_size, shuffle=False) + else: + valloader = None # Get initial AE - neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs, + neural_network = get_trained_network(trainloader,valloader, n_epochs=pretrain_epochs, optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class, device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size, neural_network=neural_network, neural_network_weights=neural_network_weights, - random_state=random_state) + log_fn=log_fn, random_state=random_state) # Encode data X_embed = encode_batchwise(testloader, neural_network) # Get possible input parameters of the manifold class @@ -135,7 +143,7 @@ class DDC_density_peak_clustering(ClusterMixin, BaseEstimator): def __init__(self, ratio: float): self.ratio = ratio - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC_density_peak_clustering': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC_density_peak_clustering': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -308,13 +316,14 @@ class DDC(_AbstractDeepClusteringAlgo): Knowledge-Based Systems 197 (2020): 105841. 
""" - def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None, + def __init__(self,n_clusters: int = None, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer_params: dict = None, pretrain_epochs: int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, neural_network: torch.nn.Module | tuple = None, neural_network_weights: str = None, embedding_size: int = 10, custom_dataloaders: tuple = None, tsne_params: dict = None, device: torch.device = None, random_state: np.random.RandomState | int = None): super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) + self.n_clusters = n_clusters self.ratio = ratio self.pretrain_optimizer_params = pretrain_optimizer_params self.pretrain_epochs = pretrain_epochs @@ -323,7 +332,7 @@ def __init__(self, ratio: float = 0.1, batch_size: int = 256, pretrain_optimizer self.custom_dataloaders = custom_dataloaders self.tsne_params = tsne_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DDC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -344,7 +353,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params = {"n_components": 2} if self.tsne_params is None else self.tsne_params if self.ratio > 1: print("[WARNING] ratio for DDC algorithm has been set to a value > 1 which can cause poor results") - n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X, None, self.batch_size, + n_clusters, labels, centers_ae, _, neural_network, tsne = _manifold_based_sequential_dc(X,val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, @@ -356,7 +365,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DDC': tsne_params, DDC_density_peak_clustering, {"ratio": self.ratio}, self.device, - random_state) + random_state,self._log_history) self.labels_ = labels self.n_clusters_ = n_clusters self.cluster_centers_ = centers_ae @@ -470,7 +479,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.manifold_params = manifold_params self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'N2D': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -479,6 +488,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -489,7 +500,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': """ X, _, random_state, pretrain_optimizer_params, _, initial_clustering_params = self._check_parameters(X, y=y) manifold_params = {"n_components": self.n_clusters} if self.manifold_params is None else self.manifold_params - _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, self.n_clusters, + _, labels, centers_ae, centers_manifold, neural_network, manifold = _manifold_based_sequential_dc(X, val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, self.pretrain_epochs, @@ -503,7 +514,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'N2D': manifold_params, GMM, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = labels.astype(np.int32) self.cluster_centers_manifold_ = centers_manifold self.cluster_centers_ = centers_ae diff --git a/clustpy/deep/dec.py b/clustpy/deep/dec.py index a2501fb..342ec81 100644 --- a/clustpy/deep/dec.py +++ b/clustpy/deep/dec.py @@ -16,13 +16,14 @@ from collections.abc import Callable# -def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, +def _dec(X: np.ndarray,val_set: np.ndarray | None, n_clusters: int, alpha: float, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, 
initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DEC clustering procedure on the input data set. @@ -31,6 +32,8 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray | None + Optional validation set for early stopping. If not None, Early stopping will be used n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN alpha : float @@ -88,11 +91,11 @@ def _dec(X: np.ndarray, n_clusters: int, alpha: float, batch_size: int, pretrain """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DEC Module - dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance).to(device) + dec_module = _DEC_Module(init_centers, alpha, augmentation_invariance,log_fn).to(device) # Use DEC optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + 
list(dec_module.parameters()), **clustering_optimizer_params) @@ -205,10 +208,11 @@ class _DEC_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alpha: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() self.alpha = alpha self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -331,6 +335,8 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we the final DEC loss """ loss = torch.tensor(0.).to(device) + ssl_loss = torch.tensor(0.).to(device) + cluster_loss = torch.tensor(0.).to(device) # Reconstruction loss is not included in DEC if ssl_loss_weight != 0: if self.augmentation_invariance: @@ -355,7 +361,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, clustering_loss_we cluster_loss = self.dec_loss(embedded) loss += cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -390,16 +396,26 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat tbar = tqdm.trange(n_epochs, desc="DEC training") for _ in tbar: total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() if ssl_loss_weight != 0 else 0 + total_cluster_loss += loss[2].item() + # Backward pass optimizer.zero_grad() - 
loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + if ssl_loss_weight != 0: + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) return self @@ -511,7 +527,7 @@ def __init__(self, n_clusters: int = 8, alpha: float = 1.0, batch_size: int = 25 self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': + def fit(self, X: np.ndarray,val_set: np.ndarray | None = None, y: np.ndarray = None) -> 'DEC': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -520,6 +536,9 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': ---------- X : np.ndarray the given data set + val_set : np.ndarray | None + optional validation set for monitoring purposes (can be ignored) + y : np.ndarray the labels (can be ignored) @@ -530,7 +549,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': """ ssl_loss_weight = self.ssl_loss_weight if hasattr(self, "ssl_loss_weight") else 0 # DEC does not use ssl loss when clustering X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) - kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X, self.n_clusters, self.alpha, + kmeans_labels, kmeans_centers, dec_labels, dec_centers, neural_network = _dec(X,val_set, self.n_clusters, self.alpha, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -547,7 +566,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEC': self.augmentation_invariance, self.initial_clustering_class, initial_clustering_params, - self.device, random_state) + self.device, random_state, + 
log_fn=self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dec_labels_ = dec_labels diff --git a/clustpy/deep/deepect.py b/clustpy/deep/deepect.py index 028afbb..364ec2e 100644 --- a/clustpy/deep/deepect.py +++ b/clustpy/deep/deepect.py @@ -79,10 +79,12 @@ class _DeepECT_Module(torch.nn.Module): augmentation_invariance : bool If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations (default: False) + log_fn : Callable | None + function for logging training history values (e.g. loss values) during training """ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_interval: int, - pruning_threshold: float, augmentation_invariance: bool = False): + pruning_threshold: float, augmentation_invariance: bool = False,log_fn: Callable | None = None): super().__init__() # Create initial cluster tree self.cluster_tree = cluster_tree @@ -90,6 +92,7 @@ def __init__(self, cluster_tree: BinaryClusterTree, max_n_leaf_nodes: int, grow_ self.grow_interval = grow_interval self.pruning_threshold = pruning_threshold self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn def predict_hard(self, embedded: torch.Tensor) -> torch.Tensor: """ @@ -137,7 +140,7 @@ def _get_labels_from_leafs(self, embedded: torch.Tensor, leaf_nodes: list) -> ( leaf_labels = torch.stack([leaf.torch_labels[0] for leaf in leaf_nodes]) # Get distances between points and centers. 
Get nearest center squared_diffs = squared_euclidean_distance(embedded, leaf_centers) - cluster_center_assignments = (squared_diffs.min(dim=1)[1]).int() + cluster_center_assignments = (squared_diffs.min(dim=1)[1]).long() labels = leaf_labels[cluster_center_assignments] return leaf_centers, cluster_center_assignments, labels @@ -372,7 +375,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla dc_loss = self._data_compression_loss(embedded, split_nodes, labels, device, embedded_aug) # Combine losses loss = clustering_loss_weight * (nc_loss + dc_loss) + ssl_loss_weight * ssl_loss - return loss, labels + return (loss, ssl_loss, nc_loss+dc_loss), labels def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -415,6 +418,8 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for epoch in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_clust_loss = 0 with torch.no_grad(): # Grow tree if (epoch % self.grow_interval == 0 or self.cluster_tree.n_leaf_nodes_ < 2) and len( @@ -426,10 +431,12 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat # Calculate loss loss, labels = self._loss(batch, neural_network, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight, leaf_nodes, split_nodes, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_clust_loss += loss[2].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() # Adapt centers and weights of split nodes analytically with torch.no_grad(): @@ -440,16 +447,20 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat leaf_nodes, split_nodes = self.cluster_tree.get_leaf_and_split_nodes() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + if 
self.log_fn is not None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_clust_loss) return self -def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, +def _deep_ect(X: np.ndarray, val_set: np.ndarray | None, max_n_leaf_nodes: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, grow_interval: int, pruning_threshold: float, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, device: torch.device, - random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): + log_fn: Callable | None,random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DeepECT clustering procedure on the input data set. @@ -457,6 +468,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op ---------- X : np.ndarray The given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) max_n_leaf_nodes : int Maximum number of leaf nodes in the cluster tree batch_size : int @@ -497,6 +510,8 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op If True, augmented samples provided in custom_dataloaders[0] will be used to learn cluster assignments that are invariant to the augmentation transformations device : torch.device The device on which to perform the computations + log_fn : Callable | None + function for logging training history values (e.g. 
loss values) during training random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -509,13 +524,13 @@ def _deep_ect(X: np.ndarray, max_n_leaf_nodes: int, batch_size: int, pretrain_op """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, _, _, init_leafnode_centers, _ = get_default_deep_clustering_initialization( - X, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, 2, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, KMeans, {"n_init": 20}, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) cluster_tree = BinaryClusterTree(_DeepECT_ClusterTreeNode) # Setup DeepECT Module deepect_module = _DeepECT_Module(cluster_tree, max_n_leaf_nodes, grow_interval, pruning_threshold, - augmentation_invariance).to(device) + augmentation_invariance,log_fn).to(device) # Use DeepECT optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) # DeepECT Training loop @@ -626,7 +641,7 @@ def __init__(self, max_n_leaf_nodes: int = 20, batch_size: int = 256, pretrain_o self.custom_dataloaders = custom_dataloaders self.augmentation_invariance = augmentation_invariance - def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> "DeepECT": """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -635,6 +650,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -644,14 +661,14 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> "DeepECT": This instance of the DeepECT algorithm """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, _ = self._check_parameters(X, y=y) - tree, labels, neural_network = _deep_ect(X, self.max_n_leaf_nodes, self.batch_size, + tree, labels, neural_network = _deep_ect(X, val_set, self.max_n_leaf_nodes, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, self.pretrain_epochs, self.clustering_epochs, self.grow_interval, self.pruning_threshold, self.optimizer_class, self.ssl_loss_fn, self.neural_network, self.neural_network_weights, self.embedding_size, self.clustering_loss_weight, self.ssl_loss_weight, self.custom_dataloaders, self.augmentation_invariance, self.device, - random_state) + self._log_history, random_state) self.tree_ = tree self.labels_ = labels self.neural_network_trained_ = neural_network diff --git a/clustpy/deep/den.py b/clustpy/deep/den.py index 7306175..80336b0 100644 --- a/clustpy/deep/den.py +++ b/clustpy/deep/den.py @@ -229,7 +229,7 @@ def _loss(self, batch: list, group_size: list, neural_network: torch.nn.Module, # Calculate group sparsity constraint group_sparsity_loss = self._group_sparsity_loss(embedded, group_size) loss = ssl_loss + self.weight_locality_constraint * locality_preserving_loss + self.weight_sparsity_constraint * group_sparsity_loss - return loss + return loss, ssl_loss, locality_preserving_loss, group_sparsity_loss def _get_nearest_neighbors(self, X: np.ndarray) -> list: @@ -294,15 +294,25 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': for _ in tbar: # Update Network total_loss = 0 + total_ssl_loss = 0 + total_locality_loss = 0 + total_sparsity_loss = 0 for batch 
in trainloader: loss = self._loss(batch, group_size, neural_network, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_locality_loss += loss[2].item() + total_sparsity_loss += loss[3].item() # Backward pass - update weights optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} tbar.set_postfix(postfix_str) + self._log_history("Total Loss", total_loss) + self._log_history("SSL Loss", total_ssl_loss) + self._log_history("Locality Loss", total_locality_loss) + self._log_history("Sparsity Loss", total_sparsity_loss) # Execute clustering with Kmeans embedded_data = encode_batchwise(testloader, neural_network) kmeans = KMeans(n_clusters=self.n_clusters, random_state=random_state) @@ -312,4 +322,5 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DEN': self.cluster_centers_ = kmeans.cluster_centers_ self.neural_network_trained_ = neural_network self.set_n_featrues_in(X) + return self diff --git a/clustpy/deep/dipdeck.py b/clustpy/deep/dipdeck.py index 8433330..7d9065b 100644 --- a/clustpy/deep/dipdeck.py +++ b/clustpy/deep/dipdeck.py @@ -469,7 +469,7 @@ def _loss(self, batch: list, neural_network: torch.nn.Module, ssl_loss_fn: Calla loss = ssl_loss_weight * ssl_loss + clustering_loss_weight * cluster_loss return loss - def fit(self, X, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, + def fit(self, X, val_set: np.ndarray | None, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, testloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, clustering_loss_weight: float, ssl_loss_weight: float, debug: bool) -> '_DipDECK_Module': diff --git a/clustpy/deep/dipencoder.py b/clustpy/deep/dipencoder.py index bf6774e..4d82748 100644 --- a/clustpy/deep/dipencoder.py +++ b/clustpy/deep/dipencoder.py @@ 
-785,7 +785,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = None, pretrain_optimiz self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DipEncoder': """ Initiate the actual clustering/dimensionality reduction process on the input data set. If no ground truth labels are given, the resulting cluster labels will be stored in the labels_ attribute. @@ -794,6 +794,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': ---------- X : np.ndarray The given (training) data set + val_set : np.ndarray + The validation data set (not used in DipEncoder, included for compatibility reasons) (default: None) y : np.ndarray The ground truth labels. If None, the DipEncoder will be used for clustering (default: None) @@ -807,7 +809,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DipEncoder': batch_size = 25 * self.n_clusters if self.batch_size is None else self.batch_size # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, X_embed, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization( - X, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, + X, val_set, self.n_clusters, batch_size, pretrain_optimizer_params, self.pretrain_epochs, self.optimizer_class, self.ssl_loss_fn, self.neural_network, self.embedding_size, self.custom_dataloaders, self.initial_clustering_class if y is None else None, initial_clustering_params, self.device, random_state, neural_network_weights=self.neural_network_weights) if y is not None: diff --git a/clustpy/deep/dkm.py b/clustpy/deep/dkm.py index 896a6f3..65426ad 100644 --- a/clustpy/deep/dkm.py +++ 
b/clustpy/deep/dkm.py @@ -14,13 +14,13 @@ from collections.abc import Callable -def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, +def _dkm(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, alphas: list | tuple, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin, - initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState) -> ( + initial_clustering_params: dict, device: torch.device, random_state: np.random.RandomState,log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual DKM clustering procedure on the input data set. @@ -29,6 +29,8 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray + validation set (can be ignored) n_clusters : int number of clusters. Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. 
DBSCAN alphas : list | tuple @@ -88,11 +90,11 @@ def _dkm(X: np.ndarray, n_clusters: int, alphas: list | tuple, batch_size: int, """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, _, init_centers, _ = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, neural_network_weights=neural_network_weights) + random_state, log_fn=log_fn, neural_network_weights=neural_network_weights) # Setup DKM Module - dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance).to(device) + dkm_module = _DKM_Module(init_centers, alphas, augmentation_invariance,log_fn).to(device) # Use DKM optimizer parameters (usually learning rate is reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(dkm_module.parameters()), **clustering_optimizer_params) @@ -183,10 +185,12 @@ class _DKM_Module(torch.nn.Module): Is augmentation invariance used """ - def __init__(self, init_centers: np.ndarray, alphas: list, augmentation_invariance: bool = False): + def __init__(self, init_centers: np.ndarray, alphas: list, + augmentation_invariance: bool = False, log_fn: Callable | None = None): super().__init__() self.alphas = alphas self.augmentation_invariance = augmentation_invariance + self.log_fn = log_fn # Centers are learnable parameters self.centers = torch.nn.Parameter(torch.tensor(init_centers), requires_grad=True) @@ -320,7 +324,7 @@ def _loss(self, batch: list, alpha: float, neural_network: torch.nn.Module, clus # Calculate clustering loss cluster_loss = self.dkm_loss(embedded, alpha) loss = ssl_loss_weight * 
ssl_loss + cluster_loss * clustering_loss_weight - return loss + return loss, ssl_loss, cluster_loss def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, optimizer: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, @@ -357,17 +361,27 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat for alpha in self.alphas: for _ in range(n_epochs): total_loss = 0 + total_ssl_loss = 0 + total_cluster_loss = 0 for batch in trainloader: loss = self._loss(batch, alpha, neural_network, clustering_loss_weight, ssl_loss_weight, ssl_loss_fn, device) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() + total_cluster_loss += loss[2].item() # Backward pass optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss, "Alpha": alpha} tbar.set_postfix(postfix_str) tbar.update() + if self.log_fn is not None: + self.log_fn("Alpha completed", alpha) + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) + self.log_fn("Clustering Loss", total_cluster_loss) + return self @@ -506,7 +520,7 @@ def _check_alphas(self) -> list: assert type(alphas) is tuple or type(alphas) is list, "alphas must be a list, int or tuple" return alphas - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': + def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'DKM': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. 
@@ -515,6 +529,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': ---------- X : np.ndarray the given data set + val_set : np.ndarray + validation set (can be ignored) y : np.ndarray the labels (can be ignored) @@ -525,7 +541,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': """ X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y) alphas = self._check_alphas() - kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, self.n_clusters, alphas, + kmeans_labels, kmeans_centers, dkm_labels, dkm_centers, neural_network = _dkm(X, val_set, self.n_clusters, alphas, self.batch_size, pretrain_optimizer_params, clustering_optimizer_params, @@ -543,7 +559,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'DKM': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = kmeans_labels self.cluster_centers_ = kmeans_centers self.dkm_labels_ = dkm_labels diff --git a/clustpy/deep/neural_networks/_abstract_autoencoder.py b/clustpy/deep/neural_networks/_abstract_autoencoder.py index f582157..86e661f 100644 --- a/clustpy/deep/neural_networks/_abstract_autoencoder.py +++ b/clustpy/deep/neural_networks/_abstract_autoencoder.py @@ -118,6 +118,8 @@ def __init__(self, work_on_copy: bool = True, random_state: np.random.RandomStat self.random_state = random_state self.fitted = False self.allow_nd_input = False + rs = check_random_state(self.random_state) + set_torch_seed(rs) def encode(self, x: torch.Tensor) -> torch.Tensor: """ @@ -266,7 +269,7 @@ def evaluate(self, dataloader: torch.utils.data.DataLoader, ssl_loss_fn: Callabl """ with torch.no_grad(): self.eval() - loss = torch.tensor(0.) 
+ loss = torch.tensor(0.0,device=device) for batch in dataloader: new_loss, _, _ = self.loss(batch, ssl_loss_fn, device) loss += new_loss @@ -279,7 +282,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in optimizer_class: torch.optim.Optimizer = torch.optim.Adam, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, patience: int = 5, scheduler: torch.optim.lr_scheduler = None, scheduler_params: dict = {}, - corruption_fn: Callable = None, model_path: str = None) -> '_AbstractAutoencoder': + corruption_fn: Callable = None, model_path: str = None,log_fn: Callable[[str, float], None] = None) -> '_AbstractAutoencoder': """ Trains the autoencoder in place. @@ -316,7 +319,8 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in For example, if the data is normalized, this may have to be taken into account in the corruption function - e.g. in case of salt and pepper noise (default: None) model_path : str if specified will save the trained model to the location. If evalloader is used, then only the best model w.r.t. evaluation loss is saved (default: None) - + log_fn : Callable[[str, float], None] + function that takes a string and a float as input and logs the training process (default: None) Returns ------- self : _AbstractAutoencoder @@ -355,6 +359,7 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in # training loop device = get_device_from_module(self) tbar = tqdm.trange(n_epochs, desc="AE training") + for epoch_i in tbar: self.train() total_loss = 0 @@ -382,8 +387,16 @@ def fit(self, n_epochs: int = 100, optimizer_params: dict = None, batch_size: in self.save_parameters(model_path) if early_stopping.early_stop: print(f"Stop training at epoch {best_epoch}. 
Best Loss: {best_loss:.6f}, Last Loss: {val_loss:.6f}") + break if scheduler is not None and eval_step_scheduler: scheduler.step(val_loss) + if log_fn is not None: + if evalloader is not None: + log_fn("pretrain/Eval Loss", val_loss.item()) + log_fn("pretrain/Train Loss", total_loss) + tbar.set_postfix(postfix_str) # change to eval mode after training self.eval() diff --git a/clustpy/deep/vade.py b/clustpy/deep/vade.py index b5d6e79..06ccc23 100644 --- a/clustpy/deep/vade.py +++ b/clustpy/deep/vade.py @@ -17,13 +17,14 @@ from collections.abc import Callable -def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, +def _vade(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict, clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int, optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, neural_network: torch.nn.Module | tuple, neural_network_weights: str, embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float, custom_dataloaders: tuple, initial_clustering_class: ClusterMixin, initial_clustering_params: dict, - device: torch.device, random_state: np.random.RandomState) -> ( + device: torch.device, random_state: np.random.RandomState, + log_fn: Callable | None) -> ( np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module): """ Start the actual VaDE clustering procedure on the input data set. @@ -32,6 +33,8 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa ---------- X : np.ndarray / torch.Tensor the given data set. Can be a np.ndarray or a torch.Tensor + val_set : np.ndarray / torch.Tensor | None + validation set (can be ignored) n_clusters : int number of clusters. 
Can be None if a corresponding initial_clustering_class is given, that can determine the number of clusters, e.g. DBSCAN batch_size : int @@ -86,15 +89,15 @@ def _vade(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_pa """ # Get initial setting (device, dataloaders, pretrained AE and initial clustering result) device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_means, init_clustering_algo = get_default_deep_clustering_initialization( - X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, + X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn, neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device, - random_state, _VaDE_VAE, neural_network_weights=neural_network_weights) + random_state, _VaDE_VAE, log_fn=log_fn, neural_network_weights=neural_network_weights) # Get parameters from initial clustering algorithm init_weights = None if not hasattr(init_clustering_algo, "weights_") else init_clustering_algo.weights_ init_covs = None if not hasattr(init_clustering_algo, "covariances_") else init_clustering_algo.covariances_ # Initialize VaDE vade_module = _VaDE_Module(n_clusters=n_clusters, embedding_size=embedding_size, weights=init_weights, - means=init_means, variances=init_covs).to(device) + means=init_means, variances=init_covs,log_fn=log_fn).to(device) # Use vade learning_rate (usually pretrain_optimizer_params reduced by a magnitude of 10) optimizer = optimizer_class(list(neural_network.parameters()) + list(vade_module.parameters()), **clustering_optimizer_params) @@ -238,7 +241,7 @@ class _VaDE_Module(torch.nn.Module): """ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = None, means: torch.Tensor = None, - variances: torch.Tensor = None): + variances: torch.Tensor = None, log_fn: Callable | None = None): super(_VaDE_Module, 
self).__init__() if weights is None: # if not initialized then use uniform distribution @@ -254,7 +257,7 @@ def __init__(self, n_clusters: int, embedding_size: int, weights: torch.Tensor = embedding_size), "Shape of the initial variances for the Vade_Module must be (n_clusters, embedding_size)" self.p_log_var = torch.nn.Parameter(torch.log(torch.tensor(variances)), requires_grad=True) self.normalize_prob = torch.nn.Softmax(dim=0) - + self.log_fn = log_fn def predict(self, q_mean: torch.Tensor, q_logvar: torch.Tensor) -> torch.Tensor: """ Predict the labels given the specific means and variances of given samples. @@ -305,9 +308,9 @@ def vade_loss(self, neural_network: VariationalAutoencoder, batch_data: torch.Te z, q_mean, q_logvar, reconstruction = neural_network.forward(batch_data) pi_normalized = self.normalize_prob(self.pi) p_c_z = _get_gamma(pi_normalized, self.p_mean, self.p_log_var, z) - loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, + loss, ssl_loss = _compute_vade_loss(pi_normalized, self.p_mean, self.p_log_var, q_mean, q_logvar, batch_data, p_c_z, reconstruction, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - return loss + return loss, ssl_loss def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.data.DataLoader, trainloader: torch.utils.data.DataLoader, n_epochs: int, device: torch.device, @@ -348,16 +351,21 @@ def fit(self, neural_network: VariationalAutoencoder, testloader: torch.utils.da for _ in tbar: self.train() total_loss = 0 + total_ssl_loss = 0 for batch in trainloader: # load batch on device batch_data = batch[1].to(device) loss = self.vade_loss(neural_network, batch_data, ssl_loss_fn, clustering_loss_weight, ssl_loss_weight) - total_loss += loss.item() + total_loss += loss[0].item() + total_ssl_loss += loss[1].item() optimizer.zero_grad() - loss.backward() + loss[0].backward() optimizer.step() postfix_str = {"Loss": total_loss} + if self.log_fn is not 
None: + self.log_fn("Total Loss", total_loss) + self.log_fn("SSL Loss", total_ssl_loss) tbar.set_postfix(postfix_str) return self @@ -478,7 +486,7 @@ def _compute_vade_loss(pi: torch.Tensor, p_mean: torch.Tensor, p_log_var: torch. loss = p_z_c - p_c - q_z_x + q_c_x loss /= batch_data.size(0) loss = clustering_loss_weight * loss + ssl_loss_weight * p_x_z # Beware that we do not divide two times by number of samples - return loss + return loss, p_x_z class VaDE(_AbstractDeepClusteringAlgo): @@ -589,7 +597,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize self.initial_clustering_class = initial_clustering_class self.initial_clustering_params = initial_clustering_params - def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': + def fit(self, X: np.ndarray, y: np.ndarray = None, val_set: np.ndarray = None) -> 'VaDE': """ Initiate the actual clustering process on the input data set. The resulting cluster labels will be stored in the labels_ attribute. @@ -598,6 +606,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': ---------- X : np.ndarray the given data set y : np.ndarray the labels (can be ignored) + val_set : np.ndarray + optional validation set forwarded to the training procedure (can be None) @@ -613,6 +623,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': "covariance_type": "diag"} if self.initial_clustering_params is None else self.initial_clustering_params gmm_labels, gmm_means, gmm_covariances, gmm_weights, vade_labels, vade_centers, vade_covariances, neural_network = _vade( X, + val_set, self.n_clusters, self.batch_size, pretrain_optimizer_params, @@ -630,7 +641,8 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'VaDE': self.initial_clustering_class, initial_clustering_params, self.device, - random_state) + random_state, + self._log_history) self.labels_ = gmm_labels self.cluster_centers_ = gmm_means self.covariances_ = gmm_covariances