1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@ __pycache__/
*.py[cod]
*$py.class

testing_preprocessing.py
# C extensions
*.so
*.dll
15 changes: 15 additions & 0 deletions clustpy/deep/_abstract_deep_clustering_algo.py
@@ -1,3 +1,4 @@
from collections import defaultdict
from clustpy.deep._utils import set_torch_seed
from sklearn.base import TransformerMixin, BaseEstimator, ClusterMixin
import numpy as np
@@ -37,6 +38,20 @@ def __init__(self, batch_size: int, neural_network: torch.nn.Module | tuple, neu
self.embedding_size = embedding_size
self.device = device
self.random_state = random_state
self.history_ = defaultdict(list)

def _log_history(self, key: str, value) -> None:
"""
Log pretraining and clustering history values (e.g. loss values) during training.

Parameters
----------
key : str
the key under which to store the value
value : float
the value to append to the history under the given key (will be converted to float)

"""
self.history_[key].append(float(value))

def _check_parameters(self, X: np.ndarray, *, y: np.ndarray=None) -> (np.ndarray, np.ndarray, np.random.RandomState, dict, dict, dict):
"""
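Together, these two additions give every deep clustering estimator a history_ attribute that accumulates logged values per key. A minimal standalone sketch of the mechanism, reproduced outside the class purely for illustration:

# history_ is a defaultdict(list), so _log_history can append under any key
# without initializing the list first; values are coerced to plain floats.
from collections import defaultdict

history_ = defaultdict(list)

def _log_history(key: str, value) -> None:
    history_[key].append(float(value))

_log_history("Total Loss", 3.17)
_log_history("Total Loss", 2.84)
print(history_["Total Loss"])  # [3.17, 2.84]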
28 changes: 22 additions & 6 deletions clustpy/deep/_train_utils.py
@@ -105,13 +105,15 @@ def get_neural_network(input_dim: int, embedding_size: int = 10, neural_network:
return neural_network


def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: np.ndarray = None,
def get_trained_network(trainloader: torch.utils.data.DataLoader = None, evalloader: torch.utils.data.DataLoader | None = None, data: np.ndarray = None,
n_epochs: int = 100, batch_size: int = 128, optimizer_params: dict = None,
optimizer_class: torch.optim.Optimizer = torch.optim.Adam, device=None,
ssl_loss_fn: Callable | torch.nn.modules.loss._Loss = mean_squared_error, embedding_size: int = 10,
neural_network: torch.nn.Module | tuple = None,
neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
neural_network_params: dict = None, neural_network_weights: str = None,
neural_network_params: dict = None,
log_fn: Callable[[str, float], None] = None,
neural_network_weights: str = None,
random_state: np.random.RandomState | int = None) -> torch.nn.Module:
"""This function returns a trained neural network. The following cases are considered
- If the neural network is initialized and trained (neural_network.fitted==True), then return input neural network without training it again.
@@ -123,6 +125,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n
----------
trainloader : torch.utils.data.DataLoader
dataloader used to train neural_network (default: None)
evalloader : torch.utils.data.DataLoader | None
dataloader used for early stopping during training (default: None)
data : np.ndarray
train data set. If data is passed then trainloader can remain empty (default: None)
n_epochs : int
@@ -147,6 +151,8 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n
The neural network class that should be used (default: FeedforwardAutoencoder)
neural_network_params : dict
Parameters to be used when creating a new neural network using the neural_network_class (default: None)
log_fn : Callable[[str, float], None]
Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters (default: None)
neural_network_weights : str
Path to a file containing the state_dict of the neural_network (default: None)
random_state : np.random.RandomState | int
@@ -170,12 +176,12 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n
print("Neural network is not fitted yet, will be pretrained.")
# Pretrain neural network
optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params
neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader,
optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn)
neural_network.fit(n_epochs=n_epochs, optimizer_params=optimizer_params, dataloader=trainloader, evalloader=evalloader,
optimizer_class=optimizer_class, ssl_loss_fn=ssl_loss_fn, log_fn=log_fn)
return neural_network


def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_clusters: int, batch_size: int,
def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, val_set: np.ndarray | torch.Tensor | None, n_clusters: int, batch_size: int,
pretrain_optimizer_params: dict, pretrain_epochs: int,
optimizer_class: torch.optim.Optimizer,
ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
@@ -185,6 +191,7 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
random_state: np.random.RandomState,
neural_network_class: torch.nn.Module = FeedforwardAutoencoder,
neural_network_params: dict = None,
log_fn: Callable[[str, float], None] = None,
neural_network_weights: str = None) -> (
torch.device, torch.utils.data.DataLoader, torch.utils.data.DataLoader, int, torch.nn.Module, np.ndarray, int,
np.ndarray, np.ndarray, ClusterMixin):
@@ -196,6 +203,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
----------
X : np.ndarray | torch.Tensor
the given data set. Can be a np.ndarray or a torch.Tensor
val_set : np.ndarray | torch.Tensor | None
validation data set used for early stopping. Can be a np.ndarray or a torch.Tensor. If None, no early stopping will be performed
n_clusters : int
number of clusters. Can be None if a corresponding initial_clustering_class is given, e.g. DBSCAN
batch_size : int
@@ -231,6 +240,8 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
The neural network class that should be used (default: FeedforwardAutoencoder)
neural_network_params : dict
Parameters to be used when creating a new neural network using the neural_network_class (default: None)
log_fn : Callable[[str, float], None]
Function to log pretraining information such as loss values. It has to take a string (key) and a float (value) as input parameters (default: None)
neural_network_weights : str
Path to a file containing the state_dict of the neural_network (default: None)

@@ -250,12 +261,17 @@ def get_default_deep_clustering_initialization(X: np.ndarray | torch.Tensor, n_c
"""
device = detect_device(device)
trainloader, testloader, batch_size = get_train_and_test_dataloader(X, batch_size, custom_dataloaders)
neural_network = get_trained_network(trainloader, n_epochs=pretrain_epochs,
if val_set is not None:
evalloader = get_dataloader(val_set, batch_size, shuffle=True)
else:
evalloader = None
neural_network = get_trained_network(trainloader, evalloader=evalloader, n_epochs=pretrain_epochs,
optimizer_params=pretrain_optimizer_params, optimizer_class=optimizer_class,
device=device, ssl_loss_fn=ssl_loss_fn, embedding_size=embedding_size,
neural_network=neural_network, neural_network_class=neural_network_class,
neural_network_params=neural_network_params,
neural_network_weights=neural_network_weights,
log_fn=log_fn,
random_state=random_state)
# Execute initial clustering in embedded space
embedded_data = encode_batchwise(testloader, neural_network)
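A usage sketch of the extended pretraining entry point, assuming toy random data and a simple print-based log_fn (both illustrative, not part of the diff); the import paths follow the modules changed here:

# Pretrain a network with a validation loader for early stopping and a
# custom log_fn receiving (key, value) pairs, per the new signature above.
import numpy as np
from clustpy.deep._data_utils import get_dataloader
from clustpy.deep._train_utils import get_trained_network

X_train = np.random.rand(800, 32)
X_val = np.random.rand(200, 32)
trainloader = get_dataloader(X_train, batch_size=128, shuffle=True)
evalloader = get_dataloader(X_val, batch_size=128, shuffle=True)

def print_log(key: str, value: float) -> None:
    # Matches the documented Callable[[str, float], None] contract
    print(f"{key}: {value:.4f}")

network = get_trained_network(trainloader, evalloader=evalloader,
                              n_epochs=10, embedding_size=10,
                              log_fn=print_log)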
39 changes: 28 additions & 11 deletions clustpy/deep/aec.py
@@ -14,14 +14,14 @@
from collections.abc import Callable


def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
def _aec(X: np.ndarray, val_set: np.ndarray | None, n_clusters: int, batch_size: int, pretrain_optimizer_params: dict,
clustering_optimizer_params: dict, pretrain_epochs: int, clustering_epochs: int,
optimizer_class: torch.optim.Optimizer, ssl_loss_fn: Callable | torch.nn.modules.loss._Loss,
neural_network: torch.nn.Module | tuple, neural_network_weights: str,
embedding_size: int, clustering_loss_weight: float, ssl_loss_weight: float,
custom_dataloaders: tuple, augmentation_invariance: bool, initial_clustering_class: ClusterMixin,
initial_clustering_params: dict, device: torch.device,
random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module):
log_fn: Callable | None, random_state: np.random.RandomState) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module):
"""
Start the actual AEC clustering procedure on the input data set.

@@ -31,6 +31,8 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par
the given data set. Can be a np.ndarray or a torch.Tensor
val_set : np.ndarray | None
optional validation set for early stopping. If not None, early stopping will be used
n_clusters : int
number of clusters. Can be None if a corresponding initial_clustering_class is given that can determine the number of clusters, e.g. DBSCAN
batch_size : int
size of the data batches
pretrain_optimizer_params : dict
@@ -70,9 +72,13 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par
parameters for the initial clustering class
device : torch.device
The device on which to perform the computations
log_fn : Callable | None
function for logging training history values (e.g. loss values) during training
random_state : np.random.RandomState
use a fixed random state to get a repeatable solution

Returns
-------
tuple : (np.ndarray, np.ndarray, np.ndarray, np.ndarray, torch.nn.Module)
Expand All @@ -82,11 +88,11 @@ def _aec(X: np.ndarray, n_clusters: int, batch_size: int, pretrain_optimizer_par
"""
# Get initial setting (device, dataloaders, pretrained AE and initial clustering result)
device, trainloader, testloader, _, neural_network, _, n_clusters, init_labels, init_centers, _ = get_default_deep_clustering_initialization(
X, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn,
X, val_set, n_clusters, batch_size, pretrain_optimizer_params, pretrain_epochs, optimizer_class, ssl_loss_fn,
neural_network, embedding_size, custom_dataloaders, initial_clustering_class, initial_clustering_params, device,
random_state, neural_network_weights=neural_network_weights)
random_state, log_fn=log_fn, neural_network_weights=neural_network_weights)
# Setup AEC Module
aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance).to_device(device)
aec_module = _AEC_Module(init_labels, init_centers, augmentation_invariance, log_fn).to_device(device)
# Use AEC optimizer parameters (usually learning rate is reduced by a magnitude of 10)
optimizer = optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params)
# AEC Training loop
@@ -120,11 +126,13 @@ class _AEC_Module(_DCN_Module):
the cluster centers
augmentation_invariance : bool
Is augmentation invariance used
log_fn : Callable | None
function for logging training history values (e.g. loss values) during training
"""

def __init__(self, init_np_labels: np.ndarray, init_np_centers: np.ndarray,
augmentation_invariance: bool = False):
super().__init__(init_np_labels, init_np_centers, augmentation_invariance)
augmentation_invariance: bool = False, log_fn: Callable | None = None):
super().__init__(init_np_labels, init_np_centers, augmentation_invariance, log_fn)

def update_centroids(self, embedded: np.ndarray, labels: np.ndarray) -> torch.Tensor:
"""
@@ -188,14 +196,18 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat
for _ in tbar:
# Update Network
total_loss = 0
total_ssl_loss = 0
total_clustering_loss = 0
for batch in trainloader:
# Beware that the clustering loss of DCN is divided by 2, therefore we use 2 * clustering_loss_weight
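# self._loss is expected to return a tuple here: (total loss, SSL loss, clustering loss)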
loss = self._loss(batch, neural_network, ssl_loss_fn, ssl_loss_weight,
2 * clustering_loss_weight, device)
total_loss += loss.item()
total_loss += loss[0].item()
total_ssl_loss += loss[1].item()
total_clustering_loss += loss[2].item()
# Backward pass - update weights
optimizer.zero_grad()
loss.backward()
loss[0].backward()
optimizer.step()
postfix_str = {"Loss": total_loss}
tbar.set_postfix(postfix_str)
@@ -207,6 +219,10 @@ def fit(self, neural_network: torch.nn.Module, trainloader: torch.utils.data.Dat
# update assignments
labels = self.predict_hard(torch.tensor(embedded).to(device))
self.labels = labels.to(device)
if self.log_fn is not None:
self.log_fn("Total Loss", total_loss)
self.log_fn("SSL Loss", total_ssl_loss)
self.log_fn("Clustering Loss", total_clustering_loss)
return self


@@ -315,7 +331,7 @@ def __init__(self, n_clusters: int = 8, batch_size: int = 256, pretrain_optimize
self.initial_clustering_class = initial_clustering_class
self.initial_clustering_params = initial_clustering_params

def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC':
def fit(self, X: np.ndarray, val_set: np.ndarray = None, y: np.ndarray = None) -> 'AEC':
"""
Initiate the actual clustering process on the input data set.
The resulting cluster labels will be stored in the labels_ attribute.
@@ -333,7 +349,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC':
this instance of the AEC algorithm
"""
X, _, random_state, pretrain_optimizer_params, clustering_optimizer_params, initial_clustering_params = self._check_parameters(X, y=y)
aec_labels, aec_centers, neural_network = _aec(X, self.n_clusters, self.batch_size,
aec_labels, aec_centers, neural_network = _aec(X, val_set, self.n_clusters, self.batch_size,
pretrain_optimizer_params,
clustering_optimizer_params,
self.pretrain_epochs,
@@ -349,6 +365,7 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'AEC':
self.initial_clustering_class,
initial_clustering_params,
self.device,
self._log_history,
random_state)
self.labels_ = aec_labels
self.cluster_centers_ = aec_centers
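To close the loop, a sketch of the extended AEC.fit call introduced in this diff. The data shapes are arbitrary, the "Total Loss" key follows the log_fn calls in the training loop above, and the import assumes AEC is exported from clustpy.deep like the other deep clustering algorithms:

# Pass a validation set to enable early stopping during pretraining and
# inspect the loss history collected via self._log_history afterwards.
import numpy as np
from clustpy.deep import AEC

X = np.random.rand(1000, 50)
X_val = np.random.rand(200, 50)

aec = AEC(n_clusters=5)
aec.fit(X, val_set=X_val)

print(aec.labels_[:10])
print(aec.history_["Total Loss"])  # one entry per clustering epoch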